
Commit

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Oct 5, 2021
1 parent 8288f67 commit eb96b91
Showing 17 changed files with 395 additions and 210 deletions.
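The changes below are mechanical formatting fixes (import reordering, trailing commas, long-line wrapping) applied automatically by the pre-commit.ci bot. As a minimal sketch, assuming the repository ships a .pre-commit-config.yaml with formatting hooks (that file is not part of this diff), the same fixes could be reproduced locally like so:

# Hypothetical helper, not part of the commit: re-run the repository's
# pre-commit hooks over the whole tree, which is what pre-commit.ci does
# before pushing an auto-fix commit like this one.
import subprocess
import sys


def run_all_hooks() -> int:
    # 'pre-commit run --all-files' applies every configured hook to every
    # tracked file and exits non-zero if any hook modified or rejected a file.
    return subprocess.run(['pre-commit', 'run', '--all-files']).returncode


if __name__ == '__main__':
    sys.exit(run_all_hooks())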
6 changes: 3 additions & 3 deletions docs/conf.py
@@ -11,6 +11,7 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.
import os
import sys

sys.path.insert(0, os.path.abspath('..'))


@@ -34,7 +35,7 @@
'sphinx.ext.intersphinx',
'sphinx.ext.viewcode',
'sphinx_autodoc_typehints',
'sphinx_click.ext'
'sphinx_click.ext',
]

# Add any paths that contain templates here, relative to this directory.
@@ -80,7 +81,6 @@
'github_count': False,
# Header & footer
'show_powered_by': False,

}

# -- Extension configuration -------------------------------------------------
@@ -89,7 +89,7 @@
'python': ('https://docs.python.org/3/', None),
'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
'recordlinkage': ('https://recordlinkage.readthedocs.io/en/latest/', None),
'requests': ('https://requests.readthedocs.io/en/stable/', None),
'requests': ('https://requests.readthedocs.io/en/stable/', None),
'sqlalchemy': ('https://docs.sqlalchemy.org/en/13/', None),
'sklearn': ('https://scikit-learn.org/stable/', None),
'mlens': ('https://mlens.readthedocs.io/en/0.1.x/', None),
3 changes: 1 addition & 2 deletions scripts/basic_url_stats.py
Expand Up @@ -16,7 +16,7 @@
import csv
import json
import sys
from collections import defaultdict, OrderedDict
from collections import OrderedDict, defaultdict
from urllib.parse import urlsplit


@@ -50,4 +50,3 @@ def main(args):

if __name__ == '__main__':
sys.exit(main(sys.argv))

38 changes: 26 additions & 12 deletions scripts/build_web_domains_table.py
@@ -17,7 +17,7 @@
import json
import os
import sys
from collections import defaultdict, OrderedDict
from collections import OrderedDict, defaultdict
from random import sample
from urllib.parse import urlsplit

@@ -38,7 +38,7 @@
'discogs_musical_work': 'https://www.discogs.com/master/',
'musicbrainz_band': 'https://musicbrainz.org/artist/',
'musicbrainz_musician': 'https://musicbrainz.org/artist/',
'musicbrainz_musical_work': 'https://musicbrainz.org/release-group/'
'musicbrainz_musical_work': 'https://musicbrainz.org/release-group/',
}
WIKI_PROJECTS = (
'wikipedia',
@@ -59,16 +59,18 @@ def main(args):
if len(args) != 2:
print(
f"Usage: python {__file__} URLS_CSV\n"
"URLS_CSV file name must start with 'CATALOG_ENTITY_urls', "
"e.g., 'discogs_band_urls'"
"URLS_CSV file name must start with 'CATALOG_ENTITY_urls', "
"e.g., 'discogs_band_urls'"
)
return 1

file_in = args[1]
catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0]
file_out = f'{catalog_and_entity}_web_domains_table.mediawiki'
json_out = f'{catalog_and_entity}.json'
header = HEADER.replace('TARGET', catalog_and_entity.replace('_', ' ').title())
header = HEADER.replace(
'TARGET', catalog_and_entity.replace('_', ' ').title()
)
prefix = CATALOG_URL_PREFIXES.get(catalog_and_entity)

if prefix is None:
@@ -81,13 +83,23 @@

with open(file_in) as fin:
r = csv.reader(fin)
for (_, _, url, tid,) in r:
for (
_,
_,
url,
tid,
) in r:
domain = urlsplit(url).netloc
if any(wiki_project in domain for wiki_project in WIKI_PROJECTS):
wiki_urls += 1
continue
freq[domain] += 1
urls[domain].append((url, tid,))
urls[domain].append(
(
url,
tid,
)
)

print(f'Total wiki URLs found: {wiki_urls}')

@@ -105,17 +117,19 @@

examples = sample(urls[domain], N_EXAMPLES)
buffer = []
for i, (url, tid,) in enumerate(examples, 1):
for i, (
url,
tid,
) in enumerate(examples, 1):
buffer.append(f'{i}. [{url} URL], [{prefix}{tid} record]; ')

fout.write(ROW.format(
domain=domain, freq=freq, examples=''.join(buffer)
))
fout.write(
ROW.format(domain=domain, freq=freq, examples=''.join(buffer))
)
fout.write(FOOTER)

return 0


if __name__ == '__main__':
sys.exit(main(sys.argv))

12 changes: 8 additions & 4 deletions scripts/delete_claims.py
@@ -24,9 +24,10 @@
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2021, Hjfocs'

import requests
import sys

import requests

WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php'
STMT_PREFIX = 'https://www.wikidata.org/entity/statement/'

@@ -57,8 +58,12 @@ def main(args):
# Fire a POST for each GUID
for guid in guids:
data = {
'action': 'wbremoveclaims', 'format': 'json', 'token': token,
'bot': True, 'claim': guid, 'summary': summary
'action': 'wbremoveclaims',
'format': 'json',
'token': token,
'bot': True,
'claim': guid,
'summary': summary,
}
r = session.post(WIKIDATA_API_URL, data=data)

@@ -70,4 +75,3 @@

if __name__ == '__main__':
sys.exit(main(sys.argv))

102 changes: 75 additions & 27 deletions scripts/legacy/bne_baseline_matcher.py
@@ -3,12 +3,11 @@

import csv
import json

from collections import defaultdict

HOME = '/Users/focs/'

# TODO resolve Click issues
# TODO resolve Click issues
def temporary_wrapper():
# Wikidata sample, labels
qid_labels = json.load(open(HOME + 'wikidata/final_1_percent_sample.json'))
@@ -19,34 +18,51 @@ def temporary_wrapper():
for l in labels:
label_qid[l.lower()] = qid
# Better sample with labels as keys
json.dump(label_qid, open(HOME + 'wikidata/label2qid_1_percent_sample.json', 'w'), indent=2, ensure_ascii=False)
json.dump(
label_qid,
open(HOME + 'wikidata/label2qid_1_percent_sample.json', 'w'),
indent=2,
ensure_ascii=False,
)

# BNE, name labels
label_bne = {}
bne_names = csv.DictReader(open(HOME + 'bne/all_people_ids_and_names.csv'))
for row in bne_names:
label_bne[row['name'].lower()] = row['id'].replace('https://datos.bne.es/resource/', '')
label_bne[row['name'].lower()] = row['id'].replace(
'https://datos.bne.es/resource/', ''
)

# BNE, 'also known as' labels
aka_bne = {}
bne_aka = csv.DictReader(open(HOME + 'bne/aka_people'))
for row in bne_aka:
aka_bne[row['aka'].lower()] = row['id'].replace('https://datos.bne.es/resource/', '')
aka_bne[row['aka'].lower()] = row['id'].replace(
'https://datos.bne.es/resource/', ''
)

### Baseline matcher 1: perfect strings
# Perfect matches against BNE names
matched = defaultdict(list)
for d in (label_qid, label_bne):
for k,v in d.items():
for k, v in d.items():
matched[k].append(v)
json.dump({v[0]: v[1] for v in matched.values() if len(v) > 1}, open('perfect_matches.json', 'w'), indent=2)
json.dump(
{v[0]: v[1] for v in matched.values() if len(v) > 1},
open('perfect_matches.json', 'w'),
indent=2,
)

# Perfect matches against BNE AKA
matched = defaultdict(list)
for d in (label_qid, aka_bne):
for k,v in d.items():
matched[k].append(v)
json.dump({v[0]: v[1] for v in matched.values() if len(v) > 1}, open('aka_perfect_matches.json', 'w'), indent=2)
for k, v in d.items():
matched[k].append(v)
json.dump(
{v[0]: v[1] for v in matched.values() if len(v) > 1},
open('aka_perfect_matches.json', 'w'),
indent=2,
)

# Links available in BNE
isni = 'https://isni-url.oclc.nl/isni/'
@@ -57,9 +73,15 @@ def temporary_wrapper():

# Wikidata sample, links
linked_wd = {}
wd_linked = csv.DictReader(open(HOME + 'wikidata/linked_1_percent_sample.tsv'), delimiter='\t')
wd_linked = csv.DictReader(
open(HOME + 'wikidata/linked_1_percent_sample.tsv'), delimiter='\t'
)
for row in wd_linked:
qid = row['?person'].replace('<https://www.wikidata.org/entity/', '').replace('>', '')
qid = (
row['?person']
.replace('<https://www.wikidata.org/entity/', '')
.replace('>', '')
)
if row.get('?viaf'):
linked_wd[viaf + row['?viaf']] = qid
if row.get('?isni'):
@@ -75,34 +97,54 @@ def temporary_wrapper():
bne_linked = csv.DictReader(open(HOME + 'bne/linked_people'))
linked_bne = {}
for row in bne_linked:
linked_bne[row['link']] = row['id'].replace('https://datos.bne.es/resource/', '')

linked_bne[row['link']] = row['id'].replace(
'https://datos.bne.es/resource/', ''
)

### Baseline matcher 2: cross-catalogs links
matched = defaultdict(list)
for d in (linked_wd, linked_bne):
for k,v in d.items():
matched[k].append(v)
json.dump({v[0]: v[1] for v in matched.values() if len(v) > 1}, open('link_matches.json', 'w'), indent=2)
for k, v in d.items():
matched[k].append(v)
json.dump(
{v[0]: v[1] for v in matched.values() if len(v) > 1},
open('link_matches.json', 'w'),
indent=2,
)

### Baseline matcher 3: Wikipedia links
# BNE, DBpedia links
bbdb = filter(lambda x: 'dbpedia.org' in x, linked_bne)
dbp = {x.replace('https://dbpedia.org/resource/', ''): linked_bne[x] for x in bbdb}
dbp = {
x.replace('https://dbpedia.org/resource/', ''): linked_bne[x]
for x in bbdb
}

# Wikidata sample, site links
site_qid = json.load(open(HOME + 'wikidata/site2qid_1_percent_sample.json'))
matched = defaultdict(list)
for d in (site_qid, dbp):
for k,v in d.items():
matched[k].append(v)
json.dump({v[0]: v[1] for v in matched.values() if len(v) > 1}, open('enwiki__matches.json', 'w'), indent=2, ensure_ascii=False)
for k, v in d.items():
matched[k].append(v)
json.dump(
{v[0]: v[1] for v in matched.values() if len(v) > 1},
open('enwiki__matches.json', 'w'),
indent=2,
ensure_ascii=False,
)

### Baseline matcher 4: name AND dates
# Wikidata sample, dates
dates_wd = {}
wd_dates = csv.DictReader(open('dates_1_percent_sample.tsv'), delimiter='\t')
wd_dates = csv.DictReader(
open('dates_1_percent_sample.tsv'), delimiter='\t'
)
for row in wd_dates:
qid = row['?person'].replace('<https://www.wikidata.org/entity/', '').replace('>', '')
qid = (
row['?person']
.replace('<https://www.wikidata.org/entity/', '')
.replace('>', '')
)
for label in qid_labels[qid].keys():
name_and_date = label + '|' + row['?birth'][:4] + '-'
if row.get('?death'):
@@ -114,7 +156,9 @@ def temporary_wrapper():
dates_bne = {}
bne_labels = defaultdict(list)
for row in bne_names:
bne_labels[row['id'].replace('https://datos.bne.es/resource/', '')].append(row['name'].lower())
bne_labels[
row['id'].replace('https://datos.bne.es/resource/', '')
].append(row['name'].lower())
for row in bne_dates:
ident = row['id'].replace('https://datos.bne.es/resource/', '')
for name in bne_labels[ident]:
@@ -126,7 +170,11 @@ def temporary_wrapper():

matched = defaultdict(list)
for d in (dates_wd, dates_bne):
for k,v in d.items():
for k, v in d.items():
matched[k].append(v)
json.dump({v[0]: v[1] for v in matched.values() if len(v) > 1}, open('name_and_date_matches.json', 'w'), indent=2, ensure_ascii=False)

json.dump(
{v[0]: v[1] for v in matched.values() if len(v) > 1},
open('name_and_date_matches.json', 'w'),
indent=2,
ensure_ascii=False,
)
2 changes: 1 addition & 1 deletion scripts/legacy/extract_imdb_dates.py
@@ -33,5 +33,5 @@
else:
no_morte += 1
dates[''.join(sb)] = ide

json.dump(dates, open('dates_id.json', 'w'), indent=2, ensure_ascii=False)
