pre-commit automatic fixes
pre-commit-ci[bot] committed Oct 5, 2021
1 parent c2b2515 commit cb91637
Showing 42 changed files with 249 additions and 735 deletions.
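These are automated formatting fixes committed by pre-commit.ci, which runs the hooks declared in the repository's .pre-commit-config.yaml. The pattern of the changes below (previously wrapped statements joined onto single lines of up to roughly 88 characters, with quote style left untouched) is consistent with the black formatter, but the actual configuration is not part of this diff. The following is only a minimal sketch of a config that could produce fixes of this kind; the revision pin and the arguments are assumptions, not taken from the commit:

    # .pre-commit-config.yaml -- hypothetical sketch, not shown in this commit
    repos:
      - repo: https://github.com/psf/black
        rev: 21.9b0  # assumed pin; the repository's actual revision is not visible here
        hooks:
          - id: black
            args: [--skip-string-normalization]  # assumed, since single quotes are preserved below

With such a setup, running pre-commit run --all-files locally would reproduce the same kind of line-joining rewrites that the bot committed here.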
8 changes: 2 additions & 6 deletions scripts/build_web_domains_table.py
@@ -68,9 +68,7 @@ def main(args):
catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0]
file_out = f'{catalog_and_entity}_web_domains_table.mediawiki'
json_out = f'{catalog_and_entity}.json'
-header = HEADER.replace(
-    'TARGET', catalog_and_entity.replace('_', ' ').title()
-)
+header = HEADER.replace('TARGET', catalog_and_entity.replace('_', ' ').title())
prefix = CATALOG_URL_PREFIXES.get(catalog_and_entity)

if prefix is None:
@@ -123,9 +121,7 @@ def main(args):
) in enumerate(examples, 1):
buffer.append(f'{i}. [{url} URL], [{prefix}{tid} record]; ')

-fout.write(
-    ROW.format(domain=domain, freq=freq, examples=''.join(buffer))
-)
+fout.write(ROW.format(domain=domain, freq=freq, examples=''.join(buffer)))
fout.write(FOOTER)

return 0
19 changes: 6 additions & 13 deletions scripts/legacy/bne_baseline_matcher.py
@@ -97,9 +97,7 @@ def temporary_wrapper():
bne_linked = csv.DictReader(open(HOME + 'bne/linked_people'))
linked_bne = {}
for row in bne_linked:
-linked_bne[row['link']] = row['id'].replace(
-    'http://datos.bne.es/resource/', ''
-)
+linked_bne[row['link']] = row['id'].replace('http://datos.bne.es/resource/', '')

### Baseline matcher 2: cross-catalogs links
matched = defaultdict(list)
@@ -115,10 +113,7 @@ def temporary_wrapper():
### Baseline matcher 3: Wikipedia links
# BNE, DBpedia links
bbdb = filter(lambda x: 'dbpedia.org' in x, linked_bne)
-dbp = {
-    x.replace('http://dbpedia.org/resource/', ''): linked_bne[x]
-    for x in bbdb
-}
+dbp = {x.replace('http://dbpedia.org/resource/', ''): linked_bne[x] for x in bbdb}

# Wikidata sample, site links
site_qid = json.load(open(HOME + 'wikidata/site2qid_1_percent_sample.json'))
@@ -136,9 +131,7 @@ def temporary_wrapper():
### Baseline matcher 4: name AND dates
# Wikidata sample, dates
dates_wd = {}
-wd_dates = csv.DictReader(
-    open('dates_1_percent_sample.tsv'), delimiter='\t'
-)
+wd_dates = csv.DictReader(open('dates_1_percent_sample.tsv'), delimiter='\t')
for row in wd_dates:
qid = (
row['?person']
@@ -156,9 +149,9 @@ def temporary_wrapper():
dates_bne = {}
bne_labels = defaultdict(list)
for row in bne_names:
-bne_labels[
-    row['id'].replace('http://datos.bne.es/resource/', '')
-].append(row['name'].lower())
+bne_labels[row['id'].replace('http://datos.bne.es/resource/', '')].append(
+    row['name'].lower()
+)
for row in bne_dates:
ident = row['id'].replace('http://datos.bne.es/resource/', '')
for name in bne_labels[ident]:
20 changes: 5 additions & 15 deletions scripts/legacy/compute_mixnmatch_and_sqid_stats.py
@@ -21,9 +21,7 @@
'total_entries': int(mnm[db]['total']),
'in_wikidata': float(int(mnm[db]['manual']) / int(mnm[db]['total'])),
'unable_to_match': float(int(mnm[db]['noq']) / int(mnm[db]['total'])),
-'matched_to_be_curated': float(
-    int(mnm[db]['autoq']) / int(mnm[db]['total'])
-),
+'matched_to_be_curated': float(int(mnm[db]['autoq']) / int(mnm[db]['total'])),
'url': mnm[db]['url'],
}
for db in mnm.keys()
@@ -59,9 +57,7 @@
)

# All SQID Wikidata properties
-sqid = requests.get(
-    'https://tools.wmflabs.org/sqid/data/properties.json'
-).json()
+sqid = requests.get('https://tools.wmflabs.org/sqid/data/properties.json').json()
# SQID properties having external IDs as values
sqid_all = {
pid: {
@@ -78,12 +74,8 @@
mnm_people_with_pid = {
mnm[db]['wd_prop']: {
'mnm_total_db_entries': int(mnm[db]['total']),
-'mnm_in_wikidata': float(
-    int(mnm[db]['manual']) / int(mnm[db]['total'])
-),
-'mnm_unable_to_match': float(
-    int(mnm[db]['noq']) / int(mnm[db]['total'])
-),
+'mnm_in_wikidata': float(int(mnm[db]['manual']) / int(mnm[db]['total'])),
+'mnm_unable_to_match': float(int(mnm[db]['noq']) / int(mnm[db]['total'])),
'mnm_matched_to_be_curated': float(
int(mnm[db]['autoq']) / int(mnm[db]['total'])
),
@@ -109,9 +101,7 @@
)
)
by_mnm_entries = OrderedDict(
-sorted(
-    final.items(), key=lambda x: x[1]['mnm_total_db_entries'], reverse=True
-)
+sorted(final.items(), key=lambda x: x[1]['mnm_total_db_entries'], reverse=True)
)
json.dump(
by_sqid_usage,
7 changes: 2 additions & 5 deletions scripts/legacy/dates.py
@@ -5,12 +5,9 @@

WD = '/Users/focs/wikidata/'

-entities = [
-    l.rstrip() for l in open(WD + 'humans_1_percent_sample').readlines()
-]
+entities = [l.rstrip() for l in open(WD + 'humans_1_percent_sample').readlines()]
buckets = [
-    entities[i * 100 : (i + 1) * 100]
-    for i in range(0, int((len(entities) / 100 + 1)))
+    entities[i * 100 : (i + 1) * 100] for i in range(0, int((len(entities) / 100 + 1)))
]
with open(WD + 'dates_1_percent_sample.tsv', 'w') as o:
for b in buckets:
3 changes: 1 addition & 2 deletions scripts/legacy/identifiers.py
@@ -3,8 +3,7 @@

entities = [l.rstrip() for l in open('1_percent_sample').readlines()]
buckets = [
-    entities[i * 100 : (i + 1) * 100]
-    for i in range(0, int((len(entities) / 100 + 1)))
+    entities[i * 100 : (i + 1) * 100] for i in range(0, int((len(entities) / 100 + 1)))
]
with open('linked_1_percent_sample.tsv', 'w') as o:
for b in buckets:
5 changes: 1 addition & 4 deletions scripts/legacy/query_on_values.py
@@ -33,9 +33,6 @@ def main(items_path, sparql_condition, output_path):

if __name__ == '__main__':
if len(argv) != 4:
-print(
-    'Usage: python %s ITEMS_PATH SPARQL_CONSTRAINT OUTPUT_PATH'
-    % __file__
-)
+print('Usage: python %s ITEMS_PATH SPARQL_CONSTRAINT OUTPUT_PATH' % __file__)
exit(1)
exit(main(argv[1], argv[2], argv[3]))
8 changes: 2 additions & 6 deletions scripts/legacy/recordlinkage_first_trial.py
@@ -63,9 +63,7 @@
features = compare.compute(candidate_pairs, discogs_df, wikidata_df)
features
compare = recordlinkage.Compare()
-compare.string(
-    'name', 'name', method='levenshtein', threshold=0.7, label='stocazzo'
-)
+compare.string('name', 'name', method='levenshtein', threshold=0.7, label='stocazzo')
features = compare.compute(candidate_pairs, discogs_df, wikidata_df)
features
discogs_df[304]
@@ -103,9 +101,7 @@
from recordlinkage.preprocessing import clean

wikidata
-etichette = json.load(
-    open('/Users/focs/wikidata/label2qid_1_percent_sample.json')
-)
+etichette = json.load(open('/Users/focs/wikidata/label2qid_1_percent_sample.json'))
etichette
get_ipython().run_line_magic('pinfo', 'pandas.Series')
serie = pandas.Series(etichette)
21 changes: 5 additions & 16 deletions scripts/legacy/sample_additional_info.py
@@ -85,10 +85,7 @@ def get_links_for_sample(sample_path, url_formatters, output):
formatters_dict[prop_id].replace('$1', id_row[col])
] = entity_id
else:
-print(
-    '%s does not have an entry in the formatters file'
-    % col
-)
+print('%s does not have an entry in the formatters file' % col)

json.dump(url_id, open(filepath, 'w'), indent=2, ensure_ascii=False)

@@ -134,22 +131,16 @@ def get_birth_death_dates_for_sample(sample_path, output):
qid = get_wikidata_id_from_uri(date_row['?id'])
# creates the combination of all birth dates strings and all death dates strings
if date_row['?birth']:
-for b in get_date_strings(
-    date_row['?birth'], date_row['?b_precision']
-):
+for b in get_date_strings(date_row['?birth'], date_row['?b_precision']):
if date_row['?death']:
for d in get_date_strings(
date_row['?death'], date_row['?d_precision']
):
-labeldate_qid[
-    '%s|%s-%s' % (qid_labels[qid], b, d)
-] = qid
+labeldate_qid['%s|%s-%s' % (qid_labels[qid], b, d)] = qid
else:
labeldate_qid['%s|%s' % (qid_labels[qid], b)] = qid
else:
-for d in get_date_strings(
-    date_row['?death'], date_row['?d_precision']
-):
+for d in get_date_strings(date_row['?death'], date_row['?d_precision']):
labeldate_qid['%s|-%s' % (qid_labels[qid], d)] = qid

json.dump(labeldate_qid, open(filepath, 'w'), indent=2, ensure_ascii=False)
@@ -166,9 +157,7 @@ def get_url_formatters_for_properties(property_mapping_path, output):

formatters = {}
for _, prop_id in properties.items():
-query = (
-    """SELECT * WHERE { wd:%s wdt:P1630 ?formatterUrl . }""" % prop_id
-)
+query = """SELECT * WHERE { wd:%s wdt:P1630 ?formatterUrl . }""" % prop_id
for r in _make_request(query):
formatters[prop_id] = r['?formatterUrl']

4 changes: 1 addition & 3 deletions scripts/legacy/sitelinks.py
@@ -35,9 +35,7 @@
for qid in r['entities']:
entity = r['entities'][qid]
if entity.get('sitelinks'):
-site_qid[
-    entity['sitelinks']['enwiki']['title'].replace(' ', '_')
-] = qid
+site_qid[entity['sitelinks']['enwiki']['title'].replace(' ', '_')] = qid

json.dump(
site_qid,
12 changes: 2 additions & 10 deletions scripts/legacy/sparql_templates.py
@@ -1,18 +1,10 @@
from soweego.wikidata.sparql_queries import ITEM_BINDING, PROPERTY_BINDING

VALUES_QUERY_TEMPLATE = (
-    'SELECT * WHERE { VALUES '
-    + ITEM_BINDING
-    + ' { %s } . '
-    + ITEM_BINDING
-    + ' %s }'
+    'SELECT * WHERE { VALUES ' + ITEM_BINDING + ' { %s } . ' + ITEM_BINDING + ' %s }'
)
CATALOG_QID_QUERY_TEMPLATE = (
-    'SELECT '
-    + ITEM_BINDING
-    + ' WHERE { wd:%s wdt:P1629 '
-    + ITEM_BINDING
-    + ' . }'
+    'SELECT ' + ITEM_BINDING + ' WHERE { wd:%s wdt:P1629 ' + ITEM_BINDING + ' . }'
)
PROPERTIES_WITH_URL_DATATYPE_QUERY = (
'SELECT '
12 changes: 3 additions & 9 deletions scripts/linker/analyze_classification_links.py
@@ -112,9 +112,7 @@
}
)

-summaries = pd.DataFrame(summaries).sort_values(
-    by="Average Mean", ascending=False
-)
+summaries = pd.DataFrame(summaries).sort_values(by="Average Mean", ascending=False)

print(summaries.to_csv(index=False))

@@ -184,9 +182,7 @@
d["Prediction"].value_counts(normalize=True).reset_index()
)

-dcounts = dcounts.rename(
-    columns={"index": "Value", "Prediction": "Counts"}
-)
+dcounts = dcounts.rename(columns={"index": "Value", "Prediction": "Counts"})
dcounts["Model"] = m
dcounts["Catalog/Entity"] = ce

@@ -195,6 +191,4 @@
else:
data = data.append(dcounts, ignore_index=True)

-sns.barplot(
-    x="Value", y="Counts", data=data, hue="Model", ax=axes_binary[axi]
-)
+sns.barplot(x="Value", y="Counts", data=data, hue="Model", ax=axes_binary[axi])
16 changes: 5 additions & 11 deletions scripts/linker/extract_performances.py
@@ -144,14 +144,11 @@
"Average Prec": "%.6f" % gg['Prec.Mean'].astype(float).mean(),
"Average Prec.STD": "%.6f" % gg['Prec.STD'].astype(float).mean(),
"Average Recall": "%.6f" % gg['Recall.Mean'].astype(float).mean(),
"Average Recall.STD": "%.6f"
% gg['Recall.STD'].astype(float).mean(),
"Average Recall.STD": "%.6f" % gg['Recall.STD'].astype(float).mean(),
}
)

-summaries = pd.DataFrame(summaries).sort_values(
-    by="Average F1", ascending=False
-)
+summaries = pd.DataFrame(summaries).sort_values(by="Average F1", ascending=False)

print(summaries.to_csv(index=False))

@@ -168,12 +165,9 @@
"Average F1": "%.6f" % gg['F1.Mean'].astype(float).mean(),
"Average F1.STD": "%.6f" % gg['F1.STD'].astype(float).mean(),
"Average Prec": "%.6f" % gg['Prec.Mean'].astype(float).mean(),
"Average Prec.STD": "%.6f"
% gg['Prec.STD'].astype(float).mean(),
"Average Recall": "%.6f"
% gg['Recall.Mean'].astype(float).mean(),
"Average Recall.STD": "%.6f"
% gg['Recall.STD'].astype(float).mean(),
"Average Prec.STD": "%.6f" % gg['Prec.STD'].astype(float).mean(),
"Average Recall": "%.6f" % gg['Recall.Mean'].astype(float).mean(),
"Average Recall.STD": "%.6f" % gg['Recall.STD'].astype(float).mean(),
}
)

4 changes: 1 addition & 3 deletions soweego/commons/constants.py
@@ -109,9 +109,7 @@
SAMPLES = os.path.join(SAMPLES_DIR, SAMPLES_FILENAME)
FEATURES = os.path.join(FEATURES_DIR, FEATURES_FILENAME)
LINKER_MODEL = os.path.join(MODELS_DIR, MODEL_FILENAME)
-LINKER_NESTED_CV_BEST_MODEL = os.path.join(
-    MODELS_DIR, NESTED_CV_BEST_MODEL_FILENAME
-)
+LINKER_NESTED_CV_BEST_MODEL = os.path.join(MODELS_DIR, NESTED_CV_BEST_MODEL_FILENAME)
LINKER_RESULT = os.path.join(RESULTS_DIR, RESULT_FILENAME)
LINKER_EVALUATION_PREDICTIONS = os.path.join(
RESULTS_DIR, EVALUATION_PREDICTIONS_FILENAME