pre-commit automatic fixes
pre-commit-ci[bot] committed Oct 5, 2021
1 parent c2b2515 commit cb91637
Showing 42 changed files with 249 additions and 735 deletions.
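These are automated formatting fixes committed by pre-commit.ci, which runs the hooks declared in the repository's .pre-commit-config.yaml. The pattern of the changes below (previously wrapped statements joined onto single lines of up to roughly 88 characters, with quote style left untouched) is consistent with the black formatter, but the actual configuration is not part of this diff. The following is only a minimal sketch of a config that could produce fixes of this kind; the revision pin and the arguments are assumptions, not taken from the commit:

    # .pre-commit-config.yaml -- hypothetical sketch, not shown in this commit
    repos:
      - repo: https://github.com/psf/black
        rev: 21.9b0  # assumed pin; the repository's actual revision is not visible here
        hooks:
          - id: black
            args: [--skip-string-normalization]  # assumed, since single quotes are preserved below

With such a setup, running pre-commit run --all-files locally would reproduce the same kind of line-joining rewrites that the bot committed here.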
8 changes: 2 additions & 6 deletions scripts/build_web_domains_table.py
@@ -68,9 +68,7 @@ def main(args):
catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0]
file_out = f'{catalog_and_entity}_web_domains_table.mediawiki'
json_out = f'{catalog_and_entity}.json'
-header = HEADER.replace(
-    'TARGET', catalog_and_entity.replace('_', ' ').title()
-)
+header = HEADER.replace('TARGET', catalog_and_entity.replace('_', ' ').title())
prefix = CATALOG_URL_PREFIXES.get(catalog_and_entity)

if prefix is None:
@@ -123,9 +121,7 @@ def main(args):
) in enumerate(examples, 1):
buffer.append(f'{i}. [{url} URL], [{prefix}{tid} record]; ')

-fout.write(
-    ROW.format(domain=domain, freq=freq, examples=''.join(buffer))
-)
+fout.write(ROW.format(domain=domain, freq=freq, examples=''.join(buffer)))
fout.write(FOOTER)

return 0
19 changes: 6 additions & 13 deletions scripts/legacy/bne_baseline_matcher.py
@@ -97,9 +97,7 @@ def temporary_wrapper():
bne_linked = csv.DictReader(open(HOME + 'bne/linked_people'))
linked_bne = {}
for row in bne_linked:
-linked_bne[row['link']] = row['id'].replace(
-    'http://datos.bne.es/resource/', ''
-)
+linked_bne[row['link']] = row['id'].replace('http://datos.bne.es/resource/', '')

### Baseline matcher 2: cross-catalogs links
matched = defaultdict(list)
@@ -115,10 +113,7 @@ def temporary_wrapper():
### Baseline matcher 3: Wikipedia links
# BNE, DBpedia links
bbdb = filter(lambda x: 'dbpedia.org' in x, linked_bne)
-dbp = {
-    x.replace('http://dbpedia.org/resource/', ''): linked_bne[x]
-    for x in bbdb
-}
+dbp = {x.replace('http://dbpedia.org/resource/', ''): linked_bne[x] for x in bbdb}

# Wikidata sample, site links
site_qid = json.load(open(HOME + 'wikidata/site2qid_1_percent_sample.json'))
@@ -136,9 +131,7 @@ def temporary_wrapper():
### Baseline matcher 4: name AND dates
# Wikidata sample, dates
dates_wd = {}
-wd_dates = csv.DictReader(
-    open('dates_1_percent_sample.tsv'), delimiter='\t'
-)
+wd_dates = csv.DictReader(open('dates_1_percent_sample.tsv'), delimiter='\t')
for row in wd_dates:
qid = (
row['?person']
@@ -156,9 +149,9 @@ def temporary_wrapper():
dates_bne = {}
bne_labels = defaultdict(list)
for row in bne_names:
-bne_labels[
-    row['id'].replace('http://datos.bne.es/resource/', '')
-].append(row['name'].lower())
+bne_labels[row['id'].replace('http://datos.bne.es/resource/', '')].append(
+    row['name'].lower()
+)
for row in bne_dates:
ident = row['id'].replace('http://datos.bne.es/resource/', '')
for name in bne_labels[ident]:
20 changes: 5 additions & 15 deletions scripts/legacy/compute_mixnmatch_and_sqid_stats.py
@@ -21,9 +21,7 @@
'total_entries': int(mnm[db]['total']),
'in_wikidata': float(int(mnm[db]['manual']) / int(mnm[db]['total'])),
'unable_to_match': float(int(mnm[db]['noq']) / int(mnm[db]['total'])),
-'matched_to_be_curated': float(
-    int(mnm[db]['autoq']) / int(mnm[db]['total'])
-),
+'matched_to_be_curated': float(int(mnm[db]['autoq']) / int(mnm[db]['total'])),
'url': mnm[db]['url'],
}
for db in mnm.keys()
@@ -59,9 +57,7 @@
)

# All SQID Wikidata properties
-sqid = requests.get(
-    'https://tools.wmflabs.org/sqid/data/properties.json'
-).json()
+sqid = requests.get('https://tools.wmflabs.org/sqid/data/properties.json').json()
# SQID properties having external IDs as values
sqid_all = {
pid: {
@@ -78,12 +74,8 @@
mnm_people_with_pid = {
mnm[db]['wd_prop']: {
'mnm_total_db_entries': int(mnm[db]['total']),
-'mnm_in_wikidata': float(
-    int(mnm[db]['manual']) / int(mnm[db]['total'])
-),
-'mnm_unable_to_match': float(
-    int(mnm[db]['noq']) / int(mnm[db]['total'])
-),
+'mnm_in_wikidata': float(int(mnm[db]['manual']) / int(mnm[db]['total'])),
+'mnm_unable_to_match': float(int(mnm[db]['noq']) / int(mnm[db]['total'])),
'mnm_matched_to_be_curated': float(
int(mnm[db]['autoq']) / int(mnm[db]['total'])
),
@@ -109,9 +101,7 @@
)
)
by_mnm_entries = OrderedDict(
-sorted(
-    final.items(), key=lambda x: x[1]['mnm_total_db_entries'], reverse=True
-)
+sorted(final.items(), key=lambda x: x[1]['mnm_total_db_entries'], reverse=True)
)
json.dump(
by_sqid_usage,
7 changes: 2 additions & 5 deletions scripts/legacy/dates.py
@@ -5,12 +5,9 @@

WD = '/Users/focs/wikidata/'

-entities = [
-    l.rstrip() for l in open(WD + 'humans_1_percent_sample').readlines()
-]
+entities = [l.rstrip() for l in open(WD + 'humans_1_percent_sample').readlines()]
buckets = [
-    entities[i * 100 : (i + 1) * 100]
-    for i in range(0, int((len(entities) / 100 + 1)))
+    entities[i * 100 : (i + 1) * 100] for i in range(0, int((len(entities) / 100 + 1)))
]
with open(WD + 'dates_1_percent_sample.tsv', 'w') as o:
for b in buckets:
3 changes: 1 addition & 2 deletions scripts/legacy/identifiers.py
@@ -3,8 +3,7 @@

entities = [l.rstrip() for l in open('1_percent_sample').readlines()]
buckets = [
-    entities[i * 100 : (i + 1) * 100]
-    for i in range(0, int((len(entities) / 100 + 1)))
+    entities[i * 100 : (i + 1) * 100] for i in range(0, int((len(entities) / 100 + 1)))
]
with open('linked_1_percent_sample.tsv', 'w') as o:
for b in buckets:
5 changes: 1 addition & 4 deletions scripts/legacy/query_on_values.py
@@ -33,9 +33,6 @@ def main(items_path, sparql_condition, output_path):

if __name__ == '__main__':
if len(argv) != 4:
-print(
-    'Usage: python %s ITEMS_PATH SPARQL_CONSTRAINT OUTPUT_PATH'
-    % __file__
-)
+print('Usage: python %s ITEMS_PATH SPARQL_CONSTRAINT OUTPUT_PATH' % __file__)
exit(1)
exit(main(argv[1], argv[2], argv[3]))
8 changes: 2 additions & 6 deletions scripts/legacy/recordlinkage_first_trial.py
@@ -63,9 +63,7 @@
features = compare.compute(candidate_pairs, discogs_df, wikidata_df)
features
compare = recordlinkage.Compare()
-compare.string(
-    'name', 'name', method='levenshtein', threshold=0.7, label='stocazzo'
-)
+compare.string('name', 'name', method='levenshtein', threshold=0.7, label='stocazzo')
features = compare.compute(candidate_pairs, discogs_df, wikidata_df)
features
discogs_df[304]
@@ -103,9 +101,7 @@
from recordlinkage.preprocessing import clean

wikidata
-etichette = json.load(
-    open('/Users/focs/wikidata/label2qid_1_percent_sample.json')
-)
+etichette = json.load(open('/Users/focs/wikidata/label2qid_1_percent_sample.json'))
etichette
get_ipython().run_line_magic('pinfo', 'pandas.Series')
serie = pandas.Series(etichette)
21 changes: 5 additions & 16 deletions scripts/legacy/sample_additional_info.py
@@ -85,10 +85,7 @@ def get_links_for_sample(sample_path, url_formatters, output):
formatters_dict[prop_id].replace('$1', id_row[col])
] = entity_id
else:
-print(
-    '%s does not have an entry in the formatters file'
-    % col
-)
+print('%s does not have an entry in the formatters file' % col)

json.dump(url_id, open(filepath, 'w'), indent=2, ensure_ascii=False)

@@ -134,22 +131,16 @@ def get_birth_death_dates_for_sample(sample_path, output):
qid = get_wikidata_id_from_uri(date_row['?id'])
# creates the combination of all birth dates strings and all death dates strings
if date_row['?birth']:
-for b in get_date_strings(
-    date_row['?birth'], date_row['?b_precision']
-):
+for b in get_date_strings(date_row['?birth'], date_row['?b_precision']):
if date_row['?death']:
for d in get_date_strings(
date_row['?death'], date_row['?d_precision']
):
-labeldate_qid[
-    '%s|%s-%s' % (qid_labels[qid], b, d)
-] = qid
+labeldate_qid['%s|%s-%s' % (qid_labels[qid], b, d)] = qid
else:
labeldate_qid['%s|%s' % (qid_labels[qid], b)] = qid
else:
-for d in get_date_strings(
-    date_row['?death'], date_row['?d_precision']
-):
+for d in get_date_strings(date_row['?death'], date_row['?d_precision']):
labeldate_qid['%s|-%s' % (qid_labels[qid], d)] = qid

json.dump(labeldate_qid, open(filepath, 'w'), indent=2, ensure_ascii=False)
@@ -166,9 +157,7 @@ def get_url_formatters_for_properties(property_mapping_path, output):

formatters = {}
for _, prop_id in properties.items():
-query = (
-    """SELECT * WHERE { wd:%s wdt:P1630 ?formatterUrl . }""" % prop_id
-)
+query = """SELECT * WHERE { wd:%s wdt:P1630 ?formatterUrl . }""" % prop_id
for r in _make_request(query):
formatters[prop_id] = r['?formatterUrl']

4 changes: 1 addition & 3 deletions scripts/legacy/sitelinks.py
@@ -35,9 +35,7 @@
for qid in r['entities']:
entity = r['entities'][qid]
if entity.get('sitelinks'):
-site_qid[
-    entity['sitelinks']['enwiki']['title'].replace(' ', '_')
-] = qid
+site_qid[entity['sitelinks']['enwiki']['title'].replace(' ', '_')] = qid

json.dump(
site_qid,
12 changes: 2 additions & 10 deletions scripts/legacy/sparql_templates.py
@@ -1,18 +1,10 @@
from soweego.wikidata.sparql_queries import ITEM_BINDING, PROPERTY_BINDING

VALUES_QUERY_TEMPLATE = (
-    'SELECT * WHERE { VALUES '
-    + ITEM_BINDING
-    + ' { %s } . '
-    + ITEM_BINDING
-    + ' %s }'
+    'SELECT * WHERE { VALUES ' + ITEM_BINDING + ' { %s } . ' + ITEM_BINDING + ' %s }'
)
CATALOG_QID_QUERY_TEMPLATE = (
-    'SELECT '
-    + ITEM_BINDING
-    + ' WHERE { wd:%s wdt:P1629 '
-    + ITEM_BINDING
-    + ' . }'
+    'SELECT ' + ITEM_BINDING + ' WHERE { wd:%s wdt:P1629 ' + ITEM_BINDING + ' . }'
)
PROPERTIES_WITH_URL_DATATYPE_QUERY = (
'SELECT '
12 changes: 3 additions & 9 deletions scripts/linker/analyze_classification_links.py
@@ -112,9 +112,7 @@
}
)

-summaries = pd.DataFrame(summaries).sort_values(
-    by="Average Mean", ascending=False
-)
+summaries = pd.DataFrame(summaries).sort_values(by="Average Mean", ascending=False)

print(summaries.to_csv(index=False))

@@ -184,9 +182,7 @@
d["Prediction"].value_counts(normalize=True).reset_index()
)

-dcounts = dcounts.rename(
-    columns={"index": "Value", "Prediction": "Counts"}
-)
+dcounts = dcounts.rename(columns={"index": "Value", "Prediction": "Counts"})
dcounts["Model"] = m
dcounts["Catalog/Entity"] = ce

@@ -195,6 +191,4 @@
else:
data = data.append(dcounts, ignore_index=True)

-sns.barplot(
-    x="Value", y="Counts", data=data, hue="Model", ax=axes_binary[axi]
-)
+sns.barplot(x="Value", y="Counts", data=data, hue="Model", ax=axes_binary[axi])
16 changes: 5 additions & 11 deletions scripts/linker/extract_performances.py
@@ -144,14 +144,11 @@
"Average Prec": "%.6f" % gg['Prec.Mean'].astype(float).mean(),
"Average Prec.STD": "%.6f" % gg['Prec.STD'].astype(float).mean(),
"Average Recall": "%.6f" % gg['Recall.Mean'].astype(float).mean(),
"Average Recall.STD": "%.6f"
% gg['Recall.STD'].astype(float).mean(),
"Average Recall.STD": "%.6f" % gg['Recall.STD'].astype(float).mean(),
}
)

-summaries = pd.DataFrame(summaries).sort_values(
-    by="Average F1", ascending=False
-)
+summaries = pd.DataFrame(summaries).sort_values(by="Average F1", ascending=False)

print(summaries.to_csv(index=False))

@@ -168,12 +165,9 @@
"Average F1": "%.6f" % gg['F1.Mean'].astype(float).mean(),
"Average F1.STD": "%.6f" % gg['F1.STD'].astype(float).mean(),
"Average Prec": "%.6f" % gg['Prec.Mean'].astype(float).mean(),
"Average Prec.STD": "%.6f"
% gg['Prec.STD'].astype(float).mean(),
"Average Recall": "%.6f"
% gg['Recall.Mean'].astype(float).mean(),
"Average Recall.STD": "%.6f"
% gg['Recall.STD'].astype(float).mean(),
"Average Prec.STD": "%.6f" % gg['Prec.STD'].astype(float).mean(),
"Average Recall": "%.6f" % gg['Recall.Mean'].astype(float).mean(),
"Average Recall.STD": "%.6f" % gg['Recall.STD'].astype(float).mean(),
}
)

4 changes: 1 addition & 3 deletions soweego/commons/constants.py
@@ -109,9 +109,7 @@
SAMPLES = os.path.join(SAMPLES_DIR, SAMPLES_FILENAME)
FEATURES = os.path.join(FEATURES_DIR, FEATURES_FILENAME)
LINKER_MODEL = os.path.join(MODELS_DIR, MODEL_FILENAME)
-LINKER_NESTED_CV_BEST_MODEL = os.path.join(
-    MODELS_DIR, NESTED_CV_BEST_MODEL_FILENAME
-)
+LINKER_NESTED_CV_BEST_MODEL = os.path.join(MODELS_DIR, NESTED_CV_BEST_MODEL_FILENAME)
LINKER_RESULT = os.path.join(RESULTS_DIR, RESULT_FILENAME)
LINKER_EVALUATION_PREDICTIONS = os.path.join(
RESULTS_DIR, EVALUATION_PREDICTIONS_FILENAME