From cb91637ff60aa38f291a110935e726a323488260 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Oct 2021 14:19:08 +0000 Subject: [PATCH] pre-commit automatic fixes --- scripts/build_web_domains_table.py | 8 +- scripts/legacy/bne_baseline_matcher.py | 19 +-- .../compute_mixnmatch_and_sqid_stats.py | 20 +-- scripts/legacy/dates.py | 7 +- scripts/legacy/identifiers.py | 3 +- scripts/legacy/query_on_values.py | 5 +- scripts/legacy/recordlinkage_first_trial.py | 8 +- scripts/legacy/sample_additional_info.py | 21 +-- scripts/legacy/sitelinks.py | 4 +- scripts/legacy/sparql_templates.py | 12 +- .../linker/analyze_classification_links.py | 12 +- scripts/linker/extract_performances.py | 16 +-- soweego/commons/constants.py | 4 +- soweego/commons/data_gathering.py | 46 ++---- soweego/commons/http_client.py | 4 +- soweego/commons/localizations.py | 4 +- soweego/commons/text_utils.py | 8 +- soweego/commons/url_utils.py | 20 +-- soweego/importer/base_dump_extractor.py | 4 +- soweego/importer/discogs_dump_extractor.py | 55 ++----- soweego/importer/imdb_dump_extractor.py | 28 +--- soweego/importer/importer.py | 20 +-- soweego/importer/models/base_entity.py | 13 +- soweego/importer/models/base_link_entity.py | 4 +- soweego/importer/models/base_nlp_entity.py | 4 +- soweego/importer/models/musicbrainz_entity.py | 4 +- .../importer/musicbrainz_dump_extractor.py | 68 +++------ soweego/ingester/mix_n_match_client.py | 20 +-- soweego/ingester/wikidata_bot.py | 55 ++----- soweego/linker/baseline.py | 40 ++---- soweego/linker/blocking.py | 12 +- soweego/linker/classifiers.py | 12 +- soweego/linker/evaluate.py | 50 ++----- soweego/linker/features.py | 45 ++---- soweego/linker/link.py | 28 +--- soweego/linker/train.py | 12 +- soweego/linker/workflow.py | 36 ++--- soweego/pipeline.py | 16 +-- soweego/validator/checks.py | 135 +++++------------- soweego/validator/enrichment.py | 24 +--- soweego/wikidata/api_requests.py | 55 ++----- soweego/wikidata/sparql_queries.py | 23 +-- 42 files changed, 249 insertions(+), 735 deletions(-) diff --git a/scripts/build_web_domains_table.py b/scripts/build_web_domains_table.py index 46f0f3df..77e14872 100644 --- a/scripts/build_web_domains_table.py +++ b/scripts/build_web_domains_table.py @@ -68,9 +68,7 @@ def main(args): catalog_and_entity = os.path.split(file_in)[1].partition('_urls')[0] file_out = f'{catalog_and_entity}_web_domains_table.mediawiki' json_out = f'{catalog_and_entity}.json' - header = HEADER.replace( - 'TARGET', catalog_and_entity.replace('_', ' ').title() - ) + header = HEADER.replace('TARGET', catalog_and_entity.replace('_', ' ').title()) prefix = CATALOG_URL_PREFIXES.get(catalog_and_entity) if prefix is None: @@ -123,9 +121,7 @@ def main(args): ) in enumerate(examples, 1): buffer.append(f'{i}. 
[{url} URL], [{prefix}{tid} record]; ') - fout.write( - ROW.format(domain=domain, freq=freq, examples=''.join(buffer)) - ) + fout.write(ROW.format(domain=domain, freq=freq, examples=''.join(buffer))) fout.write(FOOTER) return 0 diff --git a/scripts/legacy/bne_baseline_matcher.py b/scripts/legacy/bne_baseline_matcher.py index c74e794a..52913a3c 100644 --- a/scripts/legacy/bne_baseline_matcher.py +++ b/scripts/legacy/bne_baseline_matcher.py @@ -97,9 +97,7 @@ def temporary_wrapper(): bne_linked = csv.DictReader(open(HOME + 'bne/linked_people')) linked_bne = {} for row in bne_linked: - linked_bne[row['link']] = row['id'].replace( - 'http://datos.bne.es/resource/', '' - ) + linked_bne[row['link']] = row['id'].replace('http://datos.bne.es/resource/', '') ### Baseline matcher 2: cross-catalogs links matched = defaultdict(list) @@ -115,10 +113,7 @@ def temporary_wrapper(): ### Baseline matcher 3: Wikipedia links # BNE, DBpedia links bbdb = filter(lambda x: 'dbpedia.org' in x, linked_bne) - dbp = { - x.replace('http://dbpedia.org/resource/', ''): linked_bne[x] - for x in bbdb - } + dbp = {x.replace('http://dbpedia.org/resource/', ''): linked_bne[x] for x in bbdb} # Wikidata sample, site links site_qid = json.load(open(HOME + 'wikidata/site2qid_1_percent_sample.json')) @@ -136,9 +131,7 @@ def temporary_wrapper(): ### Baseline matcher 4: name AND dates # Wikidata sample, dates dates_wd = {} - wd_dates = csv.DictReader( - open('dates_1_percent_sample.tsv'), delimiter='\t' - ) + wd_dates = csv.DictReader(open('dates_1_percent_sample.tsv'), delimiter='\t') for row in wd_dates: qid = ( row['?person'] @@ -156,9 +149,9 @@ def temporary_wrapper(): dates_bne = {} bne_labels = defaultdict(list) for row in bne_names: - bne_labels[ - row['id'].replace('http://datos.bne.es/resource/', '') - ].append(row['name'].lower()) + bne_labels[row['id'].replace('http://datos.bne.es/resource/', '')].append( + row['name'].lower() + ) for row in bne_dates: ident = row['id'].replace('http://datos.bne.es/resource/', '') for name in bne_labels[ident]: diff --git a/scripts/legacy/compute_mixnmatch_and_sqid_stats.py b/scripts/legacy/compute_mixnmatch_and_sqid_stats.py index ab1eb7e8..35930790 100644 --- a/scripts/legacy/compute_mixnmatch_and_sqid_stats.py +++ b/scripts/legacy/compute_mixnmatch_and_sqid_stats.py @@ -21,9 +21,7 @@ 'total_entries': int(mnm[db]['total']), 'in_wikidata': float(int(mnm[db]['manual']) / int(mnm[db]['total'])), 'unable_to_match': float(int(mnm[db]['noq']) / int(mnm[db]['total'])), - 'matched_to_be_curated': float( - int(mnm[db]['autoq']) / int(mnm[db]['total']) - ), + 'matched_to_be_curated': float(int(mnm[db]['autoq']) / int(mnm[db]['total'])), 'url': mnm[db]['url'], } for db in mnm.keys() @@ -59,9 +57,7 @@ ) # All SQID Wikidata properties -sqid = requests.get( - 'https://tools.wmflabs.org/sqid/data/properties.json' -).json() +sqid = requests.get('https://tools.wmflabs.org/sqid/data/properties.json').json() # SQID properties having external IDs as values sqid_all = { pid: { @@ -78,12 +74,8 @@ mnm_people_with_pid = { mnm[db]['wd_prop']: { 'mnm_total_db_entries': int(mnm[db]['total']), - 'mnm_in_wikidata': float( - int(mnm[db]['manual']) / int(mnm[db]['total']) - ), - 'mnm_unable_to_match': float( - int(mnm[db]['noq']) / int(mnm[db]['total']) - ), + 'mnm_in_wikidata': float(int(mnm[db]['manual']) / int(mnm[db]['total'])), + 'mnm_unable_to_match': float(int(mnm[db]['noq']) / int(mnm[db]['total'])), 'mnm_matched_to_be_curated': float( int(mnm[db]['autoq']) / int(mnm[db]['total']) ), @@ -109,9 +101,7 @@ ) ) 
by_mnm_entries = OrderedDict( - sorted( - final.items(), key=lambda x: x[1]['mnm_total_db_entries'], reverse=True - ) + sorted(final.items(), key=lambda x: x[1]['mnm_total_db_entries'], reverse=True) ) json.dump( by_sqid_usage, diff --git a/scripts/legacy/dates.py b/scripts/legacy/dates.py index 8ff8fdad..9dfb5c52 100644 --- a/scripts/legacy/dates.py +++ b/scripts/legacy/dates.py @@ -5,12 +5,9 @@ WD = '/Users/focs/wikidata/' -entities = [ - l.rstrip() for l in open(WD + 'humans_1_percent_sample').readlines() -] +entities = [l.rstrip() for l in open(WD + 'humans_1_percent_sample').readlines()] buckets = [ - entities[i * 100 : (i + 1) * 100] - for i in range(0, int((len(entities) / 100 + 1))) + entities[i * 100 : (i + 1) * 100] for i in range(0, int((len(entities) / 100 + 1))) ] with open(WD + 'dates_1_percent_sample.tsv', 'w') as o: for b in buckets: diff --git a/scripts/legacy/identifiers.py b/scripts/legacy/identifiers.py index 4e98f827..c36d3f48 100644 --- a/scripts/legacy/identifiers.py +++ b/scripts/legacy/identifiers.py @@ -3,8 +3,7 @@ entities = [l.rstrip() for l in open('1_percent_sample').readlines()] buckets = [ - entities[i * 100 : (i + 1) * 100] - for i in range(0, int((len(entities) / 100 + 1))) + entities[i * 100 : (i + 1) * 100] for i in range(0, int((len(entities) / 100 + 1))) ] with open('linked_1_percent_sample.tsv', 'w') as o: for b in buckets: diff --git a/scripts/legacy/query_on_values.py b/scripts/legacy/query_on_values.py index 1123f0be..058df006 100644 --- a/scripts/legacy/query_on_values.py +++ b/scripts/legacy/query_on_values.py @@ -33,9 +33,6 @@ def main(items_path, sparql_condition, output_path): if __name__ == '__main__': if len(argv) != 4: - print( - 'Usage: python %s ITEMS_PATH SPARQL_CONSTRAINT OUTPUT_PATH' - % __file__ - ) + print('Usage: python %s ITEMS_PATH SPARQL_CONSTRAINT OUTPUT_PATH' % __file__) exit(1) exit(main(argv[1], argv[2], argv[3])) diff --git a/scripts/legacy/recordlinkage_first_trial.py b/scripts/legacy/recordlinkage_first_trial.py index 0aad315c..00431df9 100644 --- a/scripts/legacy/recordlinkage_first_trial.py +++ b/scripts/legacy/recordlinkage_first_trial.py @@ -63,9 +63,7 @@ features = compare.compute(candidate_pairs, discogs_df, wikidata_df) features compare = recordlinkage.Compare() -compare.string( - 'name', 'name', method='levenshtein', threshold=0.7, label='stocazzo' -) +compare.string('name', 'name', method='levenshtein', threshold=0.7, label='stocazzo') features = compare.compute(candidate_pairs, discogs_df, wikidata_df) features discogs_df[304] @@ -103,9 +101,7 @@ from recordlinkage.preprocessing import clean wikidata -etichette = json.load( - open('/Users/focs/wikidata/label2qid_1_percent_sample.json') -) +etichette = json.load(open('/Users/focs/wikidata/label2qid_1_percent_sample.json')) etichette get_ipython().run_line_magic('pinfo', 'pandas.Series') serie = pandas.Series(etichette) diff --git a/scripts/legacy/sample_additional_info.py b/scripts/legacy/sample_additional_info.py index 96d02c9b..4180960e 100644 --- a/scripts/legacy/sample_additional_info.py +++ b/scripts/legacy/sample_additional_info.py @@ -85,10 +85,7 @@ def get_links_for_sample(sample_path, url_formatters, output): formatters_dict[prop_id].replace('$1', id_row[col]) ] = entity_id else: - print( - '%s does not have an entry in the formatters file' - % col - ) + print('%s does not have an entry in the formatters file' % col) json.dump(url_id, open(filepath, 'w'), indent=2, ensure_ascii=False) @@ -134,22 +131,16 @@ def get_birth_death_dates_for_sample(sample_path, 
output): qid = get_wikidata_id_from_uri(date_row['?id']) # creates the combination of all birth dates strings and all death dates strings if date_row['?birth']: - for b in get_date_strings( - date_row['?birth'], date_row['?b_precision'] - ): + for b in get_date_strings(date_row['?birth'], date_row['?b_precision']): if date_row['?death']: for d in get_date_strings( date_row['?death'], date_row['?d_precision'] ): - labeldate_qid[ - '%s|%s-%s' % (qid_labels[qid], b, d) - ] = qid + labeldate_qid['%s|%s-%s' % (qid_labels[qid], b, d)] = qid else: labeldate_qid['%s|%s' % (qid_labels[qid], b)] = qid else: - for d in get_date_strings( - date_row['?death'], date_row['?d_precision'] - ): + for d in get_date_strings(date_row['?death'], date_row['?d_precision']): labeldate_qid['%s|-%s' % (qid_labels[qid], d)] = qid json.dump(labeldate_qid, open(filepath, 'w'), indent=2, ensure_ascii=False) @@ -166,9 +157,7 @@ def get_url_formatters_for_properties(property_mapping_path, output): formatters = {} for _, prop_id in properties.items(): - query = ( - """SELECT * WHERE { wd:%s wdt:P1630 ?formatterUrl . }""" % prop_id - ) + query = """SELECT * WHERE { wd:%s wdt:P1630 ?formatterUrl . }""" % prop_id for r in _make_request(query): formatters[prop_id] = r['?formatterUrl'] diff --git a/scripts/legacy/sitelinks.py b/scripts/legacy/sitelinks.py index 4bd29996..c3e42c02 100644 --- a/scripts/legacy/sitelinks.py +++ b/scripts/legacy/sitelinks.py @@ -35,9 +35,7 @@ for qid in r['entities']: entity = r['entities'][qid] if entity.get('sitelinks'): - site_qid[ - entity['sitelinks']['enwiki']['title'].replace(' ', '_') - ] = qid + site_qid[entity['sitelinks']['enwiki']['title'].replace(' ', '_')] = qid json.dump( site_qid, diff --git a/scripts/legacy/sparql_templates.py b/scripts/legacy/sparql_templates.py index d336d5dc..37b8ec35 100644 --- a/scripts/legacy/sparql_templates.py +++ b/scripts/legacy/sparql_templates.py @@ -1,18 +1,10 @@ from soweego.wikidata.sparql_queries import ITEM_BINDING, PROPERTY_BINDING VALUES_QUERY_TEMPLATE = ( - 'SELECT * WHERE { VALUES ' - + ITEM_BINDING - + ' { %s } . ' - + ITEM_BINDING - + ' %s }' + 'SELECT * WHERE { VALUES ' + ITEM_BINDING + ' { %s } . ' + ITEM_BINDING + ' %s }' ) CATALOG_QID_QUERY_TEMPLATE = ( - 'SELECT ' - + ITEM_BINDING - + ' WHERE { wd:%s wdt:P1629 ' - + ITEM_BINDING - + ' . }' + 'SELECT ' + ITEM_BINDING + ' WHERE { wd:%s wdt:P1629 ' + ITEM_BINDING + ' . 
}' ) PROPERTIES_WITH_URL_DATATYPE_QUERY = ( 'SELECT ' diff --git a/scripts/linker/analyze_classification_links.py b/scripts/linker/analyze_classification_links.py index 70e51057..c87205a0 100644 --- a/scripts/linker/analyze_classification_links.py +++ b/scripts/linker/analyze_classification_links.py @@ -112,9 +112,7 @@ } ) -summaries = pd.DataFrame(summaries).sort_values( - by="Average Mean", ascending=False -) +summaries = pd.DataFrame(summaries).sort_values(by="Average Mean", ascending=False) print(summaries.to_csv(index=False)) @@ -184,9 +182,7 @@ d["Prediction"].value_counts(normalize=True).reset_index() ) - dcounts = dcounts.rename( - columns={"index": "Value", "Prediction": "Counts"} - ) + dcounts = dcounts.rename(columns={"index": "Value", "Prediction": "Counts"}) dcounts["Model"] = m dcounts["Catalog/Entity"] = ce @@ -195,6 +191,4 @@ else: data = data.append(dcounts, ignore_index=True) - sns.barplot( - x="Value", y="Counts", data=data, hue="Model", ax=axes_binary[axi] - ) + sns.barplot(x="Value", y="Counts", data=data, hue="Model", ax=axes_binary[axi]) diff --git a/scripts/linker/extract_performances.py b/scripts/linker/extract_performances.py index a154ced6..39a6da0b 100644 --- a/scripts/linker/extract_performances.py +++ b/scripts/linker/extract_performances.py @@ -144,14 +144,11 @@ "Average Prec": "%.6f" % gg['Prec.Mean'].astype(float).mean(), "Average Prec.STD": "%.6f" % gg['Prec.STD'].astype(float).mean(), "Average Recall": "%.6f" % gg['Recall.Mean'].astype(float).mean(), - "Average Recall.STD": "%.6f" - % gg['Recall.STD'].astype(float).mean(), + "Average Recall.STD": "%.6f" % gg['Recall.STD'].astype(float).mean(), } ) -summaries = pd.DataFrame(summaries).sort_values( - by="Average F1", ascending=False -) +summaries = pd.DataFrame(summaries).sort_values(by="Average F1", ascending=False) print(summaries.to_csv(index=False)) @@ -168,12 +165,9 @@ "Average F1": "%.6f" % gg['F1.Mean'].astype(float).mean(), "Average F1.STD": "%.6f" % gg['F1.STD'].astype(float).mean(), "Average Prec": "%.6f" % gg['Prec.Mean'].astype(float).mean(), - "Average Prec.STD": "%.6f" - % gg['Prec.STD'].astype(float).mean(), - "Average Recall": "%.6f" - % gg['Recall.Mean'].astype(float).mean(), - "Average Recall.STD": "%.6f" - % gg['Recall.STD'].astype(float).mean(), + "Average Prec.STD": "%.6f" % gg['Prec.STD'].astype(float).mean(), + "Average Recall": "%.6f" % gg['Recall.Mean'].astype(float).mean(), + "Average Recall.STD": "%.6f" % gg['Recall.STD'].astype(float).mean(), } ) diff --git a/soweego/commons/constants.py b/soweego/commons/constants.py index 43144fae..b23c7666 100644 --- a/soweego/commons/constants.py +++ b/soweego/commons/constants.py @@ -109,9 +109,7 @@ SAMPLES = os.path.join(SAMPLES_DIR, SAMPLES_FILENAME) FEATURES = os.path.join(FEATURES_DIR, FEATURES_FILENAME) LINKER_MODEL = os.path.join(MODELS_DIR, MODEL_FILENAME) -LINKER_NESTED_CV_BEST_MODEL = os.path.join( - MODELS_DIR, NESTED_CV_BEST_MODEL_FILENAME -) +LINKER_NESTED_CV_BEST_MODEL = os.path.join(MODELS_DIR, NESTED_CV_BEST_MODEL_FILENAME) LINKER_RESULT = os.path.join(RESULTS_DIR, RESULT_FILENAME) LINKER_EVALUATION_PREDICTIONS = os.path.join( RESULTS_DIR, EVALUATION_PREDICTIONS_FILENAME diff --git a/soweego/commons/data_gathering.py b/soweego/commons/data_gathering.py index efe50bb0..31077515 100644 --- a/soweego/commons/data_gathering.py +++ b/soweego/commons/data_gathering.py @@ -21,13 +21,7 @@ from sqlalchemy import or_ from tqdm import tqdm -from soweego.commons import ( - constants, - keys, - target_database, - text_utils, - url_utils, 
-) +from soweego.commons import constants, keys, target_database, text_utils, url_utils from soweego.commons.db_manager import DBManager from soweego.importer import models from soweego.wikidata import api_requests, sparql_queries, vocabulary @@ -35,9 +29,7 @@ LOGGER = logging.getLogger(__name__) -def gather_target_biodata( - entity: str, catalog: str -) -> Optional[Iterator[tuple]]: +def gather_target_biodata(entity: str, catalog: str) -> Optional[Iterator[tuple]]: LOGGER.info( 'Gathering %s birth/death dates/places and gender metadata ...', catalog ) @@ -83,11 +75,7 @@ def tokens_fulltext_search( raise ValueError('Bad target entity class: %s' % target_entity) tokens = filter(None, tokens) - terms = ( - ' '.join(map('+{0}'.format, tokens)) - if boolean_mode - else ' '.join(tokens) - ) + terms = ' '.join(map('+{0}'.format, tokens)) if boolean_mode else ' '.join(tokens) ft_search = column.match(terms) session = DBManager.connect_to_db() @@ -144,9 +132,7 @@ def perfect_name_search( session = DBManager.connect_to_db() try: for r in ( - session.query(target_entity) - .filter(target_entity.name == to_search) - .all() + session.query(target_entity).filter(target_entity.name == to_search).all() ): yield r @@ -163,9 +149,7 @@ def perfect_name_search_bucket( session = DBManager.connect_to_db() try: for r in ( - session.query(target_entity) - .filter(target_entity.name.in_(to_search)) - .all() + session.query(target_entity).filter(target_entity.name.in_(to_search)).all() ): yield r @@ -246,9 +230,7 @@ def _run_query(query, catalog, entity_type, page=1000): "No data available for %s %s. Stopping here", catalog, entity_type ) return None - LOGGER.info( - 'Got %d internal IDs with data from %s %s', count, catalog, entity_type - ) + LOGGER.info('Got %d internal IDs with data from %s %s', count, catalog, entity_type) return query.yield_per(page).enable_eagerloads(False) @@ -269,15 +251,11 @@ def _build_biodata_query_fields(entity, entity_type, catalog): if hasattr(entity, 'birth_place'): query_fields.append(entity.birth_place) else: - LOGGER.info( - '%s %s has no birth place information', catalog, entity_type - ) + LOGGER.info('%s %s has no birth place information', catalog, entity_type) if hasattr(entity, 'death_place'): query_fields.append(entity.death_place) else: - LOGGER.info( - '%s %s has no death place information', catalog, entity_type - ) + LOGGER.info('%s %s has no death place information', catalog, entity_type) return query_fields @@ -410,9 +388,7 @@ def gather_wikidata_biodata(wikidata): timestamp, precision = parsed[0], parsed[1] # Get rid of time, useless timestamp = timestamp.split('T')[0] - wikidata[qid][keys.BIODATA].append( - (pid, f'{timestamp}/{precision}') - ) + wikidata[qid][keys.BIODATA].append((pid, f'{timestamp}/{precision}')) else: wikidata[qid][keys.BIODATA].append((pid, parsed)) total += 1 @@ -485,9 +461,7 @@ def _compile(regexp, id_or_url): def gather_target_ids(entity, catalog, catalog_pid, aggregated): - LOGGER.info( - 'Gathering Wikidata %s items with %s identifiers ...', entity, catalog - ) + LOGGER.info('Gathering Wikidata %s items with %s identifiers ...', entity, catalog) query_type = keys.IDENTIFIER, constants.SUPPORTED_ENTITIES.get(entity) diff --git a/soweego/commons/http_client.py b/soweego/commons/http_client.py index f13501fb..8ea4e873 100644 --- a/soweego/commons/http_client.py +++ b/soweego/commons/http_client.py @@ -37,9 +37,7 @@ def download_file(url, filePath): """Downloads a web content and saves it in a custom filePath""" try: file_size = 
int(requests.head(url).headers["Content-Length"]) - pbar = tqdm( - total=file_size, unit='B', unit_scale=True, desc=url.split('/')[-1] - ) + pbar = tqdm(total=file_size, unit='B', unit_scale=True, desc=url.split('/')[-1]) stream = requests.get(url, stream=True, verify=False) with open(filePath, 'wb') as f: diff --git a/soweego/commons/localizations.py b/soweego/commons/localizations.py index 8923ba80..8431b365 100644 --- a/soweego/commons/localizations.py +++ b/soweego/commons/localizations.py @@ -10,8 +10,6 @@ ) FAIL_DOWNLOAD = 'Fails on dump download' FAIL_HANDLER = 'Handler fails on dump scraping' -MALFORMED_ROW = ( - 'Malformed Row, brokes the structure ' -) +MALFORMED_ROW = 'Malformed Row, brokes the structure ' FIELD_NOT_MAPPED = 'Field: \t %s \t not mapped' WRONG_MAPPINGS = 'Errors at DB import, probably due to wrong mappings \n %s' diff --git a/soweego/commons/text_utils.py b/soweego/commons/text_utils.py index 70a29ed1..9711cfc5 100644 --- a/soweego/commons/text_utils.py +++ b/soweego/commons/text_utils.py @@ -17,9 +17,7 @@ # Adapted from http://snowball.tartarus.org/algorithms/english/stop.txt STOPWORDS_ENG = frozenset( - str( - get_data('soweego.commons.resources', 'stopwords_eng.txt'), 'utf8' - ).splitlines() + str(get_data('soweego.commons.resources', 'stopwords_eng.txt'), 'utf8').splitlines() ) COMMON_WORDS_ENG = frozenset( str( @@ -34,9 +32,7 @@ ) BAND_NAME_LOW_SCORE_WORDS = frozenset( - str( - get_data('soweego.commons.resources', 'band_low_score_words.txt') - ).splitlines() + str(get_data('soweego.commons.resources', 'band_low_score_words.txt')).splitlines() ) STOPWORDS_URL_TOKENS = frozenset( diff --git a/soweego/commons/url_utils.py b/soweego/commons/url_utils.py index f9269181..270434a3 100644 --- a/soweego/commons/url_utils.py +++ b/soweego/commons/url_utils.py @@ -61,7 +61,9 @@ def clean(url): def validate(url): ul = '\u00a1-\uffff' # Unicode letters range (must not be a raw string) # IP patterns - ipv4_re = r'(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)(?:\.(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3}' + ipv4_re = ( + r'(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)(?:\.(?:25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3}' + ) ipv6_re = r'\[[0-9a-f:\.]+\]' # Host patterns hostname_re = ( @@ -93,9 +95,7 @@ def validate(url): LOGGER.debug('Dropping invalid URL: <%s>', url) return None if not valid_url.group(1): - LOGGER.debug( - "Adding 'https' to potential URL with missing scheme: <%s>", url - ) + LOGGER.debug("Adding 'https' to potential URL with missing scheme: <%s>", url) return 'https://' + valid_url.group() return valid_url.group() @@ -116,9 +116,7 @@ def resolve(url: str) -> Optional[str]: } try: # Some Web sites do not accept the HEAD method: fire a GET, but don't download anything - response = get( - url, headers=browser_ua, stream=True, timeout=READ_TIMEOUT - ) + response = get(url, headers=browser_ua, stream=True, timeout=READ_TIMEOUT) except requests.exceptions.SSLError as ssl_error: LOGGER.debug( 'SSL certificate verification failed, will retry without verification. Original URL: <%s> - Reason: %s', @@ -189,9 +187,7 @@ def tokenize(url, domain_only=False) -> set: try: split = urlsplit(url) except ValueError as value_error: - LOGGER.warning( - 'Invalid URL: %s. Reason: %s', url, value_error, exc_info=1 - ) + LOGGER.warning('Invalid URL: %s. 
Reason: %s', url, value_error, exc_info=1) return None domain_tokens = set(re.split(r'\W+', split.netloc)) domain_tokens.difference_update(TOP_LEVEL_DOMAINS, DOMAIN_PREFIXES) @@ -335,7 +331,5 @@ def get_external_id_from_url(url, ext_id_pids_to_urls): def is_wiki_link(url): domain = urlsplit(url).netloc return ( - True - if any(wiki_project in domain for wiki_project in WIKI_PROJECTS) - else False + True if any(wiki_project in domain for wiki_project in WIKI_PROJECTS) else False ) diff --git a/soweego/importer/base_dump_extractor.py b/soweego/importer/base_dump_extractor.py index 5973f2d8..a891db46 100644 --- a/soweego/importer/base_dump_extractor.py +++ b/soweego/importer/base_dump_extractor.py @@ -26,9 +26,7 @@ class BaseDumpExtractor: populate a database instance. """ - def extract_and_populate( - self, dump_file_paths: List[str], resolve: bool - ) -> None: + def extract_and_populate(self, dump_file_paths: List[str], resolve: bool) -> None: """Extract relevant data and populate `SQLAlchemy `_ ORM entities accordingly. Entities will be then persisted to a database instance. diff --git a/soweego/importer/discogs_dump_extractor.py b/soweego/importer/discogs_dump_extractor.py index 3c95d6d4..81cc404e 100644 --- a/soweego/importer/discogs_dump_extractor.py +++ b/soweego/importer/discogs_dump_extractor.py @@ -83,9 +83,7 @@ def get_dump_download_urls(self) -> Optional[List[str]]: return None return urls - def extract_and_populate( - self, dump_file_paths: List[str], resolve: bool - ) -> None: + def extract_and_populate(self, dump_file_paths: List[str], resolve: bool) -> None: """Extract relevant data from the *artists* (people) and *masters* (works) Discogs dumps, preprocess them, populate `SQLAlchemy `_ ORM entities, and persist @@ -101,9 +99,7 @@ def extract_and_populate( self._process_masters_dump(dump_file_paths[1]) def _process_masters_dump(self, dump_file_path): - LOGGER.info( - "Starting import of masters from Discogs dump '%s'", dump_file_path - ) + LOGGER.info("Starting import of masters from Discogs dump '%s'", dump_file_path) start = datetime.now() tables = [DiscogsMasterEntity, DiscogsMasterArtistRelationship] db_manager = DBManager() @@ -124,9 +120,7 @@ def _process_masters_dump(self, dump_file_path): shutil.copyfileobj(f_in, f_out) # count number of entries - n_rows = sum( - 1 for _ in self._g_process_et_items(extracted_path, 'master') - ) + n_rows = sum(1 for _ in self._g_process_et_items(extracted_path, 'master')) session = db_manager.new_session() entity_array = [] # array to which we'll add the entities relationships_set = set() @@ -176,8 +170,7 @@ def _process_masters_dump(self, dump_file_path): end = datetime.now() LOGGER.info( - 'Import completed in %s. Total entities: %d. ' - 'Total relationships %s.', + 'Import completed in %s. Total entities: %d. 
' 'Total relationships %s.', end - start, self.total_entities, len(relationships_set), @@ -217,9 +210,7 @@ def _extract_from_master_node(node, relationships_set): ) elif child.tag == 'artists': for artist in child: - relationships_set.add( - (entity.catalog_id, artist.find('id').text) - ) + relationships_set.add((entity.catalog_id, artist.find('id').text)) entity.genres = ' '.join(genres) return entity @@ -257,9 +248,7 @@ def _extract_from_artist_node(self, node, resolve: bool) -> dict: infos['profile'] = node.findtext('profile') infos['namevariations'] = node.find('namevariations') - infos['living_links'] = self._extract_living_links( - identifier, node, resolve - ) + infos['living_links'] = self._extract_living_links(identifier, node, resolve) return infos @@ -295,9 +284,7 @@ def _process_artists_dump(self, dump_file_path, resolve): shutil.copyfileobj(f_in, f_out) # count number of entries - n_rows = sum( - 1 for _ in self._g_process_et_items(extracted_path, 'artist') - ) + n_rows = sum(1 for _ in self._g_process_et_items(extracted_path, 'artist')) session = db_manager.new_session() entity_array = [] # array to which we'll add the entities for _, node in tqdm( @@ -363,9 +350,7 @@ def _process_artists_dump(self, dump_file_path, resolve): # we can safely delete the extracted discogs dump os.remove(extracted_path) - def _populate_band( - self, entity_array, entity: DiscogsGroupEntity, infos: dict - ): + def _populate_band(self, entity_array, entity: DiscogsGroupEntity, infos: dict): # Main entity self._fill_entity(entity, infos) self.bands += 1 @@ -409,9 +394,7 @@ def _populate_links(self, entity_array, entity_class, infos: dict): self._fill_link_entity(link_entity, infos['identifier'], link) entity_array.append(link_entity) - def _populate_name_variations( - self, entity_array, infos: dict, current_entity - ): + def _populate_name_variations(self, entity_array, infos: dict, current_entity): identifier = infos['identifier'] if infos.get('namevariations') is not None: children = list(infos['namevariations']) @@ -421,9 +404,7 @@ def _populate_name_variations( ): entity_array.append(entity) else: - LOGGER.debug( - 'Artist %s has an empty tag', identifier - ) + LOGGER.debug('Artist %s has an empty tag', identifier) else: LOGGER.debug('Artist %s has no tag', identifier) @@ -442,9 +423,7 @@ def _populate_nlp_entity(self, entity_array, infos: dict, entity_class): else: self.band_nlp += 1 else: - LOGGER.debug( - 'Artist %s has an empty tag', infos['identifier'] - ) + LOGGER.debug('Artist %s has an empty tag', infos['identifier']) @staticmethod def _fill_entity(entity: DiscogsArtistEntity, infos): @@ -459,9 +438,7 @@ def _fill_entity(entity: DiscogsArtistEntity, infos): if real_name: entity.real_name = real_name else: - LOGGER.debug( - 'Artist %s has an empty tag', infos['identifier'] - ) + LOGGER.debug('Artist %s has an empty tag', infos['identifier']) # Data quality data_quality = infos['data_quality'] if data_quality: @@ -506,9 +483,7 @@ def _extract_living_links(self, identifier, node, resolve: bool): for url_element in urls.iterfind('url'): url = url_element.text if not url: - LOGGER.debug( - 'Artist %s: skipping empty tag', identifier - ) + LOGGER.debug('Artist %s: skipping empty tag', identifier) continue for alive_link in self._check_link(url, resolve): yield alive_link @@ -551,9 +526,7 @@ def _g_process_et_items(path, tag) -> Iterable[Tuple]: efficient way """ - context: etree.ElementTree = etree.iterparse( - path, events=('end',), tag=tag - ) + context: etree.ElementTree = 
etree.iterparse(path, events=('end',), tag=tag) for event, elem in context: yield event, elem diff --git a/soweego/importer/imdb_dump_extractor.py b/soweego/importer/imdb_dump_extractor.py index f573faea..be251ab1 100644 --- a/soweego/importer/imdb_dump_extractor.py +++ b/soweego/importer/imdb_dump_extractor.py @@ -67,9 +67,7 @@ def _normalize_null(entity: Dict) -> None: if value == '\\N': entity[key] = None - def extract_and_populate( - self, dump_file_paths: List[str], resolve: bool - ) -> None: + def extract_and_populate(self, dump_file_paths: List[str], resolve: bool) -> None: """Extract relevant data from the *name* (people) and *title* (works) IMDb dumps, preprocess them, populate `SQLAlchemy `_ ORM entities, and persist @@ -115,9 +113,7 @@ def extract_and_populate( LOGGER.info('Starting import of movies ...') # Here we open the movie dump file, and add everything to the DB - for movie_info, entity_array in self._loop_through_entities( - movies_file_path - ): + for movie_info, entity_array in self._loop_through_entities(movies_file_path): # create the movie SQLAlchemy entity and populate it movie_entity = imdb_entity.IMDbTitleEntity() @@ -128,9 +124,7 @@ def extract_and_populate( movie_entity.name_tokens = ' '.join( text_utils.tokenize(movie_info.get('primaryTitle')) ) - movie_entity.is_adult = ( - True if movie_info.get('isAdult') == '1' else False - ) + movie_entity.is_adult = True if movie_info.get('isAdult') == '1' else False try: movie_entity.born = datetime.date( year=int(movie_info.get('startYear')), month=1, day=1 @@ -177,9 +171,7 @@ def extract_and_populate( # reset timer for persons import start = datetime.datetime.now() - for person_info, entity_array in self._loop_through_entities( - person_file_path - ): + for person_info, entity_array in self._loop_through_entities(person_file_path): # IMDb saves the list of professions as a comma separated # string @@ -187,9 +179,7 @@ def extract_and_populate( # if person has no professions then ignore it if not professions: - LOGGER.debug( - 'Person %s has no professions', person_info.get('nconst') - ) + LOGGER.debug('Person %s has no professions', person_info.get('nconst')) continue professions = professions.split(',') @@ -359,9 +349,7 @@ def _populate_person( person_entity.catalog_id = person_info.get('nconst') person_entity.name = person_info.get('primaryName') - person_entity.name_tokens = ' '.join( - text_utils.tokenize(person_entity.name) - ) + person_entity.name_tokens = ' '.join(text_utils.tokenize(person_entity.name)) # If either `actor` or `actress` in primary profession # (which is a comma separated string of professions) @@ -371,9 +359,7 @@ def _populate_person( for prof in ['actor', 'actress'] ): person_entity.gender = ( - 'male' - if 'actor' in person_info.get('primaryProfession') - else 'female' + 'male' if 'actor' in person_info.get('primaryProfession') else 'female' ) # IMDb only provides us with the birth and death year of diff --git a/soweego/importer/importer.py b/soweego/importer/importer.py index df3f9c16..2cdb904e 100644 --- a/soweego/importer/importer.py +++ b/soweego/importer/importer.py @@ -39,9 +39,7 @@ @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) @click.option( '--url-check', is_flag=True, @@ -65,9 +63,7 @@ def import_cli(catalog: str, url_check: bool, dir_io: str) -> None: @click.command() -@click.argument( - 'catalog', 
type=click.Choice(target_database.supported_targets()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) @click.option( '-d', '--drop', @@ -116,9 +112,7 @@ def check_urls_cli(catalog, drop, dir_io): try: # Resolve every URL for resolved, result in tqdm( - pool.imap_unordered( - _resolve, query_session.query(link_entity) - ), + pool.imap_unordered(_resolve, query_session.query(link_entity)), total=total, ): if not resolved: @@ -209,12 +203,8 @@ def refresh_dump( last_modified, '%a, %d %b %Y %H:%M:%S GMT' ).strftime('%Y%m%d_%H%M%S') except TypeError: - LOGGER.info( - "Last modified not available, using now as replacement" - ) - last_modified = datetime.datetime.now().strftime( - '%Y%m%d_%H%M%S' - ) + LOGGER.info("Last modified not available, using now as replacement") + last_modified = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') extensions = download_url.split('/')[-1].split('.')[1:] diff --git a/soweego/importer/models/base_entity.py b/soweego/importer/models/base_entity.py index 950233d1..a64e3dcf 100644 --- a/soweego/importer/models/base_entity.py +++ b/soweego/importer/models/base_entity.py @@ -41,9 +41,7 @@ class BaseEntity(AbstractConcreteBase, BASE): __tablename__ = None - internal_id = Column( - Integer, unique=True, primary_key=True, autoincrement=True - ) + internal_id = Column(Integer, unique=True, primary_key=True, autoincrement=True) # Catalog identifier, indexed catalog_id = Column(String(50), nullable=False, index=True) # Full name @@ -72,10 +70,7 @@ def __table_args__(cls): ) def __repr__(self) -> str: - return ( - f'' - ) + return f'' class BaseRelationship(AbstractConcreteBase, BASE): @@ -94,9 +89,7 @@ class BaseRelationship(AbstractConcreteBase, BASE): """ __tablename__ = None - internal_id = Column( - Integer, unique=True, primary_key=True, autoincrement=True - ) + internal_id = Column(Integer, unique=True, primary_key=True, autoincrement=True) from_catalog_id = Column(String(50), nullable=False, index=False) to_catalog_id = Column(String(50), nullable=False, index=False) diff --git a/soweego/importer/models/base_link_entity.py b/soweego/importer/models/base_link_entity.py index 8b990f57..0530ab02 100644 --- a/soweego/importer/models/base_link_entity.py +++ b/soweego/importer/models/base_link_entity.py @@ -35,9 +35,7 @@ class BaseLinkEntity(AbstractConcreteBase, BASE): """ __tablename__ = None - internal_id = Column( - Integer, unique=True, primary_key=True, autoincrement=True - ) + internal_id = Column(Integer, unique=True, primary_key=True, autoincrement=True) # Catalog identifier of the entity having the link, indexed catalog_id = Column(String(50), nullable=False, index=True) # Full URL diff --git a/soweego/importer/models/base_nlp_entity.py b/soweego/importer/models/base_nlp_entity.py index a9067d6c..bb6986dc 100644 --- a/soweego/importer/models/base_nlp_entity.py +++ b/soweego/importer/models/base_nlp_entity.py @@ -35,9 +35,7 @@ class BaseNlpEntity(AbstractConcreteBase, BASE): """ __tablename__ = None - internal_id = Column( - Integer, unique=True, primary_key=True, autoincrement=True - ) + internal_id = Column(Integer, unique=True, primary_key=True, autoincrement=True) # Catalog identifier of the entity with textual data, indexed catalog_id = Column(String(50), nullable=False, index=True) # Original text diff --git a/soweego/importer/models/musicbrainz_entity.py b/soweego/importer/models/musicbrainz_entity.py index db0c602d..a097077a 100644 --- a/soweego/importer/models/musicbrainz_entity.py +++ 
b/soweego/importer/models/musicbrainz_entity.py @@ -28,9 +28,7 @@ RELEASE_GROUP_LINK_TABLE = 'musicbrainz_release_group_link' ARTIST_BAND_RELATIONSHIP_TABLE = 'musicbrainz_artist_band_relationship' -RELEASE_ARTIST_RELATIONSHIP_TABLE = ( - 'musicbrainz_release_group_artist_relationship' -) +RELEASE_ARTIST_RELATIONSHIP_TABLE = 'musicbrainz_release_group_artist_relationship' class MusicBrainzArtistEntity(BaseEntity): diff --git a/soweego/importer/musicbrainz_dump_extractor.py b/soweego/importer/musicbrainz_dump_extractor.py index 8846460a..e7f73429 100644 --- a/soweego/importer/musicbrainz_dump_extractor.py +++ b/soweego/importer/musicbrainz_dump_extractor.py @@ -73,13 +73,9 @@ def extract_and_populate(self, dump_file_paths: List[str], resolve: bool): if not os.path.isdir(dump_path): with tarfile.open(dump_file_path, "r:bz2") as tar: - LOGGER.info( - "Extracting dump %s in %s", dump_file_path, dump_path - ) + LOGGER.info("Extracting dump %s in %s", dump_file_path, dump_path) tar.extractall(dump_path) - LOGGER.info( - "Extracted dump %s in %s", dump_file_path, dump_path - ) + LOGGER.info("Extracted dump %s in %s", dump_file_path, dump_path) db_manager = DBManager() @@ -190,9 +186,7 @@ def release_artist_relationships_uniqueness_filter(): def artist_band_relationships_uniqueness_filter(): yield from [ MusicBrainzArtistBandRelationship(item[0], item[1]) - for item in set( - self._artist_band_relationship_generator(dump_path) - ) + for item in set(self._artist_band_relationship_generator(dump_path)) ] relationships_count = self._add_entities_from_generator( @@ -270,9 +264,7 @@ def _add_entities_from_generator( return n_total_entities, n_added_entities @staticmethod - def _get_urls_for_entity_id( - dump_path: str, l_path: str, resolve: bool - ) -> dict: + def _get_urls_for_entity_id(dump_path: str, l_path: str, resolve: bool) -> dict: """given a l_{something}_url relationship file, return a dict of somethingid-[urls]""" @@ -296,9 +288,7 @@ def _get_urls_for_entity_id( relationship[3], ) else: - urlid_entityid_relationship[relationship[3]] = relationship[ - 2 - ] + urlid_entityid_relationship[relationship[3]] = relationship[2] url_path = os.path.join(dump_path, 'mbdump', 'url') url_entityid = {} @@ -312,9 +302,7 @@ def _get_urls_for_entity_id( tsvfile, delimiter='\t', fieldnames=[i for i in range(0, 5)] ) - for url_record in tqdm( - urls, total=count_num_lines_in_file(tsvfile) - ): + for url_record in tqdm(urls, total=count_num_lines_in_file(tsvfile)): urlid = url_record[0] if urlid in urlid_entityid_relationship: @@ -323,9 +311,7 @@ def _get_urls_for_entity_id( continue if resolve and not url_utils.resolve(candidate_url): continue - url_entityid[ - candidate_url - ] = urlid_entityid_relationship[urlid] + url_entityid[candidate_url] = urlid_entityid_relationship[urlid] del urlid_entityid_relationship[urlid] entityid_url = defaultdict(list) @@ -373,15 +359,11 @@ def _artist_link_generator(self, dump_path: str, resolve: bool): for link in artistid_url[artist['id']]: if self._check_person(artist['type_id']): current_entity = MusicBrainzArtistLinkEntity() - self._fill_link_entity( - current_entity, artist['gid'], link - ) + self._fill_link_entity(current_entity, artist['gid'], link) yield current_entity if self._check_band(artist['type_id']): current_entity = MusicBrainzBandLinkEntity() - self._fill_link_entity( - current_entity, artist['gid'], link - ) + self._fill_link_entity(current_entity, artist['gid'], link) yield current_entity def _release_group_link_generator(self, dump_path: str, resolve: 
bool): @@ -434,9 +416,7 @@ def _isni_link_generator(self, dump_path: str, resolve: bool): for candidate_url in url_utils.clean(link): if not url_utils.validate(candidate_url): continue - if resolve and not url_utils.resolve( - candidate_url - ): + if resolve and not url_utils.resolve(candidate_url): continue artist_link[artistid] = candidate_url done = True @@ -470,15 +450,11 @@ def _isni_link_generator(self, dump_path: str, resolve: bool): link = artist_link[artist['id']] if self._check_person(artist['type_id']): current_entity = MusicBrainzArtistLinkEntity() - self._fill_link_entity( - current_entity, artist['gid'], link - ) + self._fill_link_entity(current_entity, artist['gid'], link) yield current_entity if self._check_band(artist['type_id']): current_entity = MusicBrainzBandLinkEntity() - self._fill_link_entity( - current_entity, artist['gid'], link - ) + self._fill_link_entity(current_entity, artist['gid'], link) yield current_entity except KeyError: continue @@ -549,9 +525,7 @@ def _artist_generator(self, dump_path): try: self._fill_entity(current_entity, artist, areas) - current_entity.gender = self._artist_gender( - artist['gender'] - ) + current_entity.gender = self._artist_gender(artist['gender']) except KeyError: LOGGER.error('Wrong gender code: %s', artist) continue @@ -603,9 +577,7 @@ def _artist_band_relationship_generator(dump_path): if row['link_type'] in link_types: links.add(row['id']) - artists_relationship_file = os.path.join( - dump_path, 'mbdump', 'l_artist_artist' - ) + artists_relationship_file = os.path.join(dump_path, 'mbdump', 'l_artist_artist') ids_translator = {} relationships = [] @@ -670,9 +642,7 @@ def _release_group_generator(self, dump_path): fieldnames=['id', 'gid', 'label', 'artist_credit', 'type_id'], ) - for row in tqdm( - release_reader, total=count_num_lines_in_file(releasefile) - ): + for row in tqdm(release_reader, total=count_num_lines_in_file(releasefile)): entity = MusicBrainzReleaseGroupEntity() self._fill_entity(entity, row, None) if row['id'] in release_group_datesprec: @@ -712,9 +682,7 @@ def _release_group_artist_relationship_generator(dump_path): n_rows = count_num_lines_in_file(artistcreditfile) for row in tqdm(artist_credit_reader, total=n_rows): - artist_id_release[row['artist_id']] = artist_credit_release[ - row['id'] - ] + artist_id_release[row['artist_id']] = artist_credit_release[row['id']] # memory free up for performance del artist_credit_release[row['id']] @@ -861,9 +829,7 @@ def _artist_gender(gender_code): def _retrieve_release_group_dates(self, dump_path): release_dateprec = defaultdict(lambda: (date.today(), 0)) - release_country_path = os.path.join( - dump_path, 'mbdump', 'release_country' - ) + release_country_path = os.path.join(dump_path, 'mbdump', 'release_country') with open(release_country_path) as rfile: releases = DictReader( diff --git a/soweego/ingester/mix_n_match_client.py b/soweego/ingester/mix_n_match_client.py index 3ab7eedc..7de82db9 100644 --- a/soweego/ingester/mix_n_match_client.py +++ b/soweego/ingester/mix_n_match_client.py @@ -97,9 +97,7 @@ @click.command() @click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS)) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.argument('confidence_range', type=(float, float)) @click.argument('matches', type=click.Path(exists=True, dir_okay=False)) def cli(catalog, entity, confidence_range, matches): @@ -145,9 +143,7 @@ def 
add_catalog(catalog: str, entity: str) -> int: session = DBManager(MNM_DB).new_session() try: existing = ( - session.query(mix_n_match.MnMCatalog) - .filter_by(name=name_field) - .first() + session.query(mix_n_match.MnMCatalog).filter_by(name=name_field).first() ) if existing is None: LOGGER.info( @@ -179,9 +175,7 @@ def add_catalog(catalog: str, entity: str) -> int: finally: session.close() - LOGGER.info( - 'Catalog addition/update went fine. Internal ID: %d', catalog_id - ) + LOGGER.info('Catalog addition/update went fine. Internal ID: %d', catalog_id) return catalog_id @@ -297,9 +291,7 @@ def _import_matches( url = '' if url_prefix is None else f'{url_prefix}{tid}' db_entity = mix_n_match.MnMEntry() - _set_entry_fields( - db_entity, catalog_id, qid, tid, url, class_qid, score - ) + _set_entry_fields(db_entity, catalog_id, qid, tid, url, class_qid, score) batch.append(db_entity) if len(batch) >= COMMIT_EVERY: @@ -452,9 +444,7 @@ def _set_catalog_fields(db_entity, name_field, catalog, entity): db_entity.active = 1 db_entity.note = NOTE_FIELD db_entity.type = CATALOG_TYPES.get(catalog, '') - db_entity.source_item = int( - target_database.get_catalog_qid(catalog).lstrip('Q') - ) + db_entity.source_item = int(target_database.get_catalog_qid(catalog).lstrip('Q')) wd_prop = target_database.get_catalog_pid(catalog, entity) db_entity.wd_prop = int(wd_prop.lstrip('P')) db_entity.search_wp = SEARCH_WP_FIELD diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index a96ad6b3..278f5aaf 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -93,9 +93,7 @@ @click.command() @click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS)) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.argument('invalid_identifiers', type=click.File()) @click.option( '-s', @@ -110,9 +108,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): Format: { catalog_identifier: [ list of QIDs ] } """ if sandbox: - LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 - ) + LOGGER.info('Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2) delete_or_deprecate_identifiers( 'delete', catalog, entity, json.load(invalid_identifiers), sandbox @@ -121,9 +117,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): @click.command() @click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS)) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.argument('invalid_identifiers', type=click.File()) @click.option( '-s', @@ -138,9 +132,7 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): Format: { catalog_identifier: [ list of QIDs ] } """ if sandbox: - LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 - ) + LOGGER.info('Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2) delete_or_deprecate_identifiers( 'deprecate', catalog, entity, json.load(invalid_identifiers), sandbox @@ -149,9 +141,7 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): @click.command() @click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS)) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('entity', 
type=click.Choice(target_database.supported_entities())) @click.argument('identifiers', type=click.File()) @click.option( '-s', @@ -356,8 +346,7 @@ def add_people_statements( edit_summary = BIO_VALIDATION_SUMMARY else: raise ValueError( - f"Invalid criterion: '{criterion}'. " - "Please use either 'links' or 'bio'" + f"Invalid criterion: '{criterion}'. " "Please use either 'links' or 'bio'" ) sandbox_item = vocabulary.SANDBOX_2 @@ -387,9 +376,7 @@ def add_people_statements( ) -def add_works_statements( - statements: Iterable, catalog: str, sandbox: bool -) -> None: +def add_works_statements(statements: Iterable, catalog: str, sandbox: bool) -> None: """Add statements to existing Wikidata works. Statements typically come from @@ -459,9 +446,7 @@ def delete_or_deprecate_identifiers( for tid, qids in invalid.items(): for qid in qids: actual_qid = qid if not sandbox else sandbox_item - LOGGER.info( - 'Will %s %s identifier: %s -> %s', action, catalog, tid, qid - ) + LOGGER.info('Will %s %s identifier: %s -> %s', action, catalog, tid, qid) _delete_or_deprecate(action, actual_qid, tid, catalog, catalog_pid) @@ -630,9 +615,7 @@ def _handle_addition( # No given value -> add statement if value not in existing_values: - LOGGER.debug( - '%s has no %s claim with value %s', subject_qid, predicate, value - ) + LOGGER.debug('%s has no %s claim with value %s', subject_qid, predicate, value) _add( subject_item, predicate, @@ -646,9 +629,7 @@ def _handle_addition( return # Claim with the given predicate and value -> add reference - LOGGER.debug( - "%s has a %s claim with value '%s'", subject_qid, predicate, value - ) + LOGGER.debug("%s has a %s claim with value '%s'", subject_qid, predicate, value) if case_insensitive: for claim in given_predicate_claims: if claim.getTarget().lower() == value: @@ -822,9 +803,7 @@ def _add( catalog_id, edit_summary=edit_summary, ) - LOGGER.info( - 'Added (%s, %s, %s) statement', subject_item.getID(), predicate, value - ) + LOGGER.info('Added (%s, %s, %s) statement', subject_item.getID(), predicate, value) def _reference( @@ -862,18 +841,14 @@ def _reference( if catalog_pid is not None and catalog_id is not None: # (catalog property, catalog ID) reference claim - catalog_id_reference = pywikibot.Claim( - REPO, catalog_pid, is_reference=True - ) + catalog_id_reference = pywikibot.Claim(REPO, catalog_pid, is_reference=True) catalog_id_reference.setTarget(catalog_id) reference_node.append(catalog_id_reference) log_buffer.append(f'({catalog_pid}, {catalog_id})') # All tasks # (retrieved, TODAY) reference claim - retrieved_reference = pywikibot.Claim( - REPO, vocabulary.RETRIEVED, is_reference=True - ) + retrieved_reference = pywikibot.Claim(REPO, vocabulary.RETRIEVED, is_reference=True) retrieved_reference.setTarget(TIMESTAMP) reference_node.append(retrieved_reference) log_buffer.append(f'({retrieved_reference.getID()}, {TODAY})') @@ -929,9 +904,7 @@ def _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) -> None: if action == 'delete': item.removeClaims([claim], summary='Invalid identifier') elif action == 'deprecate': - claim.changeRank( - 'deprecated', summary='Deprecate arguable claim' - ) + claim.changeRank('deprecated', summary='Deprecate arguable claim') LOGGER.debug('%s claim: %s', action.title() + 'd', claim.toJSON()) LOGGER.info( '%s %s identifier statement from %s', action.title() + 'd', catalog, qid diff --git a/soweego/linker/baseline.py b/soweego/linker/baseline.py index 0206be45..16893290 100644 --- a/soweego/linker/baseline.py +++ 
b/soweego/linker/baseline.py @@ -41,12 +41,8 @@ @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.option( '-r', '--rule', @@ -86,18 +82,14 @@ def cli(catalog, entity, rule, upload, sandbox, dir_io, dates): Run all of them by default. """ - LOGGER.info( - "Running baseline '%s' rule over %s %s ...", rule, catalog, entity - ) + LOGGER.info("Running baseline '%s' rule over %s %s ...", rule, catalog, entity) # No need for the return value: only the output file will be consumed build_wikidata('classification', catalog, entity, dir_io) _run(catalog, entity, rule, dates, upload, sandbox, dir_io) - LOGGER.info( - "Baseline '%s' rule over %s %s completed", rule, catalog, entity - ) + LOGGER.info("Baseline '%s' rule over %s %s completed", rule, catalog, entity) def _run(catalog, entity, rule, check_dates, upload, sandbox, dir_io): @@ -180,12 +172,8 @@ def _run(catalog, entity, rule, check_dates, upload, sandbox, dir_io): @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.option('-u', '--upload', is_flag=True, help='Upload links to Wikidata.') @click.option( '-s', @@ -309,12 +297,8 @@ def _perfect_names_linker( continue if wd_name.lower() == target.name.lower(): - if not compare_dates or _birth_death_date_match( - wd, target - ): - yield wd[ - keys.QID - ], catalog_pid, target.catalog_id + if not compare_dates or _birth_death_date_match(wd, target): + yield wd[keys.QID], catalog_pid, target.catalog_id bucket.clear() bucket_names.clear() @@ -354,9 +338,7 @@ def _similar_tokens_linker( for target in data_gathering.tokens_fulltext_search( target_db_entity, True, wd_tokens ): - if not compare_dates or _birth_death_date_match( - wd_item, target - ): + if not compare_dates or _birth_death_date_match(wd_item, target): yield qid, catalog_pid, target.catalog_id to_exclude.add(target.catalog_id) @@ -370,9 +352,7 @@ def _similar_tokens_linker( ): target_tokens = set(getattr(target, target_field).split()) - if len(target_tokens) > 1 and target_tokens.issubset( - wd_tokens - ): + if len(target_tokens) > 1 and target_tokens.issubset(wd_tokens): if not compare_dates or _birth_death_date_match( wd_item, target ): diff --git a/soweego/linker/blocking.py b/soweego/linker/blocking.py index e1a0e31e..6dabd077 100644 --- a/soweego/linker/blocking.py +++ b/soweego/linker/blocking.py @@ -102,9 +102,7 @@ def find_samples( wikidata_column.dropna(inplace=True) samples = _fire_queries(wikidata_column, target_db_entity) - samples_index = pd.MultiIndex.from_tuples( - samples, names=[keys.QID, keys.TID] - ) + samples_index = pd.MultiIndex.from_tuples(samples, names=[keys.QID, keys.TID]) LOGGER.debug( '%s %s samples index chunk %d random example:\n%s', @@ -151,16 +149,12 @@ def _full_text_search( ), ) ) - LOGGER.debug( - 'Target ID candidates: %s - Query terms: %s', tids, query_terms - ) + LOGGER.debug('Target ID candidates: %s - Query terms: %s', tids, query_terms) return [(qid, tid) for tid in tids] 
-def _fire_queries( - wikidata_column: pd.Series, target_db_entity: constants.DB_ENTITY -): +def _fire_queries(wikidata_column: pd.Series, target_db_entity: constants.DB_ENTITY): with Pool() as pool: for result in tqdm( pool.imap_unordered( diff --git a/soweego/linker/classifiers.py b/soweego/linker/classifiers.py index 77889305..8710ed14 100644 --- a/soweego/linker/classifiers.py +++ b/soweego/linker/classifiers.py @@ -103,9 +103,7 @@ def _fit( model_path = os.path.join( constants.WORK_DIR, - constants.NEURAL_NETWORK_CHECKPOINT_MODEL.format( - self.__class__.__name__ - ), + constants.NEURAL_NETWORK_CHECKPOINT_MODEL.format(self.__class__.__name__), ) os.makedirs(os.path.dirname(model_path), exist_ok=True) @@ -577,9 +575,7 @@ def __init__(self, num_features, **kwargs): estimators = [] for clf in constants.CLASSIFIERS_FOR_ENSEMBLE: - model = utils.init_model( - clf, num_features=self.num_features, **kwargs - ) + model = utils.init_model(clf, num_features=self.num_features, **kwargs) estimators.append((clf, model.kernel)) @@ -644,9 +640,7 @@ def __init__(self, num_features, **kwargs): def init_estimators(num_features): estimators = [] for clf in constants.CLASSIFIERS_FOR_ENSEMBLE: - model = utils.init_model( - clf, num_features=num_features, **kwargs - ) + model = utils.init_model(clf, num_features=num_features, **kwargs) estimators.append((clf, model.kernel)) return estimators diff --git a/soweego/linker/evaluate.py b/soweego/linker/evaluate.py index 97a518b6..bff6dbe8 100644 --- a/soweego/linker/evaluate.py +++ b/soweego/linker/evaluate.py @@ -34,19 +34,14 @@ context_settings={'ignore_unknown_options': True, 'allow_extra_args': True} ) @click.argument('classifier', type=click.Choice(constants.CLASSIFIERS)) -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.option('-k', '--k-folds', default=5, help="Number of folds, default: 5.") @click.option( '-s', '--single', is_flag=True, - help='Compute a single evaluation over all k folds, instead of k ' - 'evaluations.', + help='Compute a single evaluation over all k folds, instead of k ' 'evaluations.', ) @click.option( '-n', @@ -71,9 +66,7 @@ help=f'Input/output directory, default: {constants.WORK_DIR}.', ) @click.pass_context -def cli( - ctx, classifier, catalog, entity, k_folds, single, nested, metric, dir_io -): +def cli(ctx, classifier, catalog, entity, k_folds, single, nested, metric, dir_io): """Evaluate the performance of a supervised linker. By default, run 5-fold cross-validation and @@ -132,9 +125,7 @@ def cli( def _build_output_paths(catalog, entity, classifier, dir_io): classifier = constants.CLASSIFIERS.get(classifier) - performance = constants.LINKER_PERFORMANCE.format( - catalog, entity, classifier - ) + performance = constants.LINKER_PERFORMANCE.format(catalog, entity, classifier) predictions = constants.LINKER_EVALUATION_PREDICTIONS.format( catalog, entity, classifier ) @@ -259,8 +250,7 @@ def _run_nested( dir_io, ): LOGGER.warning( - 'You have opted for the slowest evaluation option, ' - 'please be patient ...' + 'You have opted for the slowest evaluation option, ' 'please be patient ...' 
) LOGGER.info( 'Starting nested %d-fold cross-validation with ' @@ -272,9 +262,7 @@ def _run_nested( param_grid = constants.PARAMETER_GRIDS.get(clf) if param_grid is None: - err_msg = ( - f'Hyperparameter tuning for classifier "{clf}" is not supported' - ) + err_msg = f'Hyperparameter tuning for classifier "{clf}" is not supported' LOGGER.critical(err_msg) raise NotImplementedError(err_msg) @@ -301,9 +289,7 @@ def _compute_performance(test_index, predictions, test_vectors_size): recall = rl.recall(test_index, predictions) f_score = rl.fscore(confusion_matrix) - LOGGER.info( - 'Precision: %f - Recall: %f - F-score: %f', precision, recall, f_score - ) + LOGGER.info('Precision: %f - Recall: %f - F-score: %f', precision, recall, f_score) LOGGER.info('Confusion matrix: %s', confusion_matrix) return precision, recall, f_score, confusion_matrix @@ -312,9 +298,7 @@ def _compute_performance(test_index, predictions, test_vectors_size): def _nested_k_fold_with_grid_search( classifier, param_grid, catalog, entity, k, scoring, dir_io, **kwargs ): - dataset, positive_samples_index = train.build_training_set( - catalog, entity, dir_io - ) + dataset, positive_samples_index = train.build_training_set(catalog, entity, dir_io) model = utils.init_model(classifier, dataset.shape[1], **kwargs).kernel inner_k_fold, target = utils.prepare_stratified_k_fold( @@ -372,16 +356,12 @@ def _nested_k_fold_with_grid_search( def _average_k_fold(classifier, catalog, entity, k, dir_io, **kwargs): predictions, precisions, recalls, f_scores = None, [], [], [] - dataset, positive_samples_index = train.build_training_set( - catalog, entity, dir_io - ) + dataset, positive_samples_index = train.build_training_set(catalog, entity, dir_io) k_fold, binary_target_variables = utils.prepare_stratified_k_fold( k, dataset, positive_samples_index ) - for train_index, test_index in k_fold.split( - dataset, binary_target_variables - ): + for train_index, test_index in k_fold.split(dataset, binary_target_variables): training, test = dataset.iloc[train_index], dataset.iloc[test_index] model = utils.init_model(classifier, dataset.shape[1], **kwargs) @@ -418,16 +398,12 @@ def _average_k_fold(classifier, catalog, entity, k, dir_io, **kwargs): def _single_k_fold(classifier, catalog, entity, k, dir_io, **kwargs): predictions, test_set = None, [] - dataset, positive_samples_index = train.build_training_set( - catalog, entity, dir_io - ) + dataset, positive_samples_index = train.build_training_set(catalog, entity, dir_io) k_fold, binary_target_variables = utils.prepare_stratified_k_fold( k, dataset, positive_samples_index ) - for train_index, test_index in k_fold.split( - dataset, binary_target_variables - ): + for train_index, test_index in k_fold.split(dataset, binary_target_variables): training, test = dataset.iloc[train_index], dataset.iloc[test_index] test_set.append(test) diff --git a/soweego/linker/features.py b/soweego/linker/features.py index f3623315..e6ae0ec3 100644 --- a/soweego/linker/features.py +++ b/soweego/linker/features.py @@ -77,9 +77,7 @@ class ExactMatch(BaseCompareFeature): """Compare pairs of lists through exact match on each pair of elements.""" name = 'exact_match' - description = ( - 'Compare pairs of lists through exact match on each pair of elements.' - ) + description = 'Compare pairs of lists through exact match on each pair of elements.' 
def __init__( self, @@ -112,9 +110,7 @@ def _compute_vectorized(self, source_column, target_column): def exact_apply(pair): if _pair_has_any_null(pair): - LOGGER.debug( - "Can't compare, the pair contains null values: %s", pair - ) + LOGGER.debug("Can't compare, the pair contains null values: %s", pair) return np.nan scores = [] @@ -367,12 +363,8 @@ def check_date_equality(pair: Tuple[List[pd.Period], List[pd.Period]]): for source, target in itertools.product(source_list, target_list): # Get precision number for both dates - s_precision = constants.PD_PERIOD_PRECISIONS.index( - source.freq.name - ) - t_precision = constants.PD_PERIOD_PRECISIONS.index( - target.freq.name - ) + s_precision = constants.PD_PERIOD_PRECISIONS.index(source.freq.name) + t_precision = constants.PD_PERIOD_PRECISIONS.index(target.freq.name) # Minimum pair precision = maximum shared precision lowest_prec = min(s_precision, t_precision) @@ -408,9 +400,7 @@ def check_date_equality(pair: Tuple[List[pd.Period], List[pd.Period]]): return best - return fillna( - concatenated.apply(check_date_equality), self.missing_value - ) + return fillna(concatenated.apply(check_date_equality), self.missing_value) class SharedTokens(BaseCompareFeature): @@ -420,8 +410,7 @@ class SharedTokens(BaseCompareFeature): name = 'shared_tokens' description = ( - 'Compare pairs of lists holding string tokens ' - 'through weighted intersection' + 'Compare pairs of lists holding string tokens ' 'through weighted intersection' ) def __init__( @@ -548,9 +537,7 @@ def _expand_occupations(self, occupation_qids: Set[str]) -> Set[str]: return expanded_set - def _compute_vectorized( - self, source_column: pd.Series, target_column: pd.Series - ): + def _compute_vectorized(self, source_column: pd.Series, target_column: pd.Series): # add the superclasses and subclasses of each occupation to # the target column @@ -563,8 +550,7 @@ def _compute_vectorized( def check_occupation_equality(pair: Tuple[Set[str], Set[str]]): if _pair_has_any_null(pair): LOGGER.debug( - "Can't compare occupations, " - "the pair contains null values: %s", + "Can't compare occupations, " "the pair contains null values: %s", pair, ) return np.nan @@ -576,9 +562,7 @@ def check_occupation_equality(pair: Tuple[Set[str], Set[str]]): return n_shared_items / min_length - return fillna( - concatenated.apply(check_occupation_equality), self.missing_value - ) + return fillna(concatenated.apply(check_occupation_equality), self.missing_value) class SharedTokensPlus(BaseCompareFeature): @@ -596,8 +580,7 @@ class SharedTokensPlus(BaseCompareFeature): name = 'shared_tokens_plus' description = ( - 'Compare pairs of lists holding string tokens ' - 'through weighted intersection' + 'Compare pairs of lists holding string tokens ' 'through weighted intersection' ) def __init__( @@ -652,17 +635,13 @@ def _compute_vectorized( # Compute shared tokens after filtering stop words def compare_apply(pair: Tuple[List[str], List[str]]) -> float: if _pair_has_any_null(pair): - LOGGER.debug( - "Can't compare, the pair contains null values: %s", pair - ) + LOGGER.debug("Can't compare, the pair contains null values: %s", pair) return np.nan # first we clean a bit the pair # make all lowercase and split on possible spaces # also reshape result into a list (flatten) - pair = [ - self._flatten([el.lower().split() for el in p]) for p in pair - ] + pair = [self._flatten([el.lower().split() for el in p]) for p in pair] s_item, t_item = pair diff --git a/soweego/linker/link.py b/soweego/linker/link.py index 4ac80fa2..08d570f1 
100644 --- a/soweego/linker/link.py +++ b/soweego/linker/link.py @@ -31,12 +31,8 @@ @click.command() @click.argument('classifier', type=click.Choice(constants.CLASSIFIERS)) -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.option( '-t', '--threshold', @@ -63,9 +59,7 @@ default=constants.WORK_DIR, help=f'Input/output directory, default: {constants.WORK_DIR}.', ) -def cli( - classifier, catalog, entity, threshold, name_rule, upload, sandbox, dir_io -): +def cli(classifier, catalog, entity, threshold, name_rule, upload, sandbox, dir_io): """Run a supervised linker. Build the classification set relevant to the given catalog and entity, @@ -82,9 +76,7 @@ def cli( """ actual_classifier = constants.CLASSIFIERS[classifier] - model_path, result_path = _handle_io( - actual_classifier, catalog, entity, dir_io - ) + model_path, result_path = _handle_io(actual_classifier, catalog, entity, dir_io) # Exit if the model file doesn't exist if model_path is None: sys.exit(1) @@ -223,9 +215,7 @@ def _apply_linking_rules(name_rule, predictions, target_chunk, wd_chunk): return predictions -def _get_unique_predictions_above_threshold( - predictions, threshold -) -> pd.DataFrame: +def _get_unique_predictions_above_threshold(predictions, threshold) -> pd.DataFrame: # Filter by threshold above_threshold = predictions[predictions >= threshold] @@ -256,9 +246,7 @@ def _handle_io(classifier, catalog, entity, dir_io): # Delete existing result file, # otherwise the current output would be appended to it if os.path.isfile(result_path): - LOGGER.warning( - "Will delete old output file found at '%s' ...", result_path - ) + LOGGER.warning("Will delete old output file found at '%s' ...", result_path) os.remove(result_path) os.makedirs(os.path.dirname(result_path), exist_ok=True) @@ -269,9 +257,7 @@ def _handle_io(classifier, catalog, entity, dir_io): def _upload(chunk, chunk_number, catalog, entity, sandbox): links = dict(chunk.to_dict().keys()) - LOGGER.info( - 'Starting upload of links to Wikidata, chunk %d ...', chunk_number - ) + LOGGER.info('Starting upload of links to Wikidata, chunk %d ...', chunk_number) wikidata_bot.add_identifiers(links, catalog, entity, sandbox) diff --git a/soweego/linker/train.py b/soweego/linker/train.py index e430802b..e88ed24b 100644 --- a/soweego/linker/train.py +++ b/soweego/linker/train.py @@ -33,12 +33,8 @@ context_settings={'ignore_unknown_options': True, 'allow_extra_args': True} ) @click.argument('classifier', type=click.Choice(constants.CLASSIFIERS)) -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.option( '-t', '--tune', @@ -72,9 +68,7 @@ def cli(ctx, classifier, catalog, entity, tune, k_folds, dir_io): actual_classifier = constants.CLASSIFIERS[classifier] - model = execute( - actual_classifier, catalog, entity, tune, k_folds, dir_io, **kwargs - ) + model = execute(actual_classifier, catalog, entity, tune, k_folds, dir_io, **kwargs) outfile = os.path.join( dir_io, diff --git 
a/soweego/linker/workflow.py b/soweego/linker/workflow.py
index 15ef8879..e39b3de1 100644
--- a/soweego/linker/workflow.py
+++ b/soweego/linker/workflow.py
@@ -49,9 +49,7 @@ LOGGER = logging.getLogger(__name__) -def build_wikidata( - goal: str, catalog: str, entity: str, dir_io: str -) -> JsonReader: +def build_wikidata(goal: str, catalog: str, entity: str, dir_io: str) -> JsonReader: """Build a Wikidata dataset for training or classification purposes: workflow step 1.
@@ -122,9 +120,7 @@ def build_wikidata( # Cached dataset, for development purposes else: - LOGGER.info( - "Will reuse existing Wikidata %s set: '%s'", goal, wd_io_path - ) + LOGGER.info("Will reuse existing Wikidata %s set: '%s'", goal, wd_io_path) if goal == 'training': _reconstruct_qids_and_tids(wd_io_path, qids_and_tids)
@@ -178,9 +174,7 @@ def build_target( for table in tables: query = query.outerjoin(table, base.catalog_id == table.catalog_id) # Condition - query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads( - False - ) + query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(False) sql = query.statement LOGGER.debug('SQL query to be fired: %s', sql)
@@ -222,9 +216,7 @@ def preprocess_wikidata( for i, chunk in enumerate(wikidata_reader, 1): # 1. QID as index chunk.set_index(keys.QID, inplace=True) - log_dataframe_info( - LOGGER, chunk, f"Built index from '{keys.QID}' column" - ) + log_dataframe_info(LOGGER, chunk, f"Built index from '{keys.QID}' column") # 2. Drop columns with null values only _drop_null_columns(chunk)
@@ -267,9 +259,7 @@ def preprocess_wikidata( yield chunk -def preprocess_target( - goal: str, target_reader: Iterator[pd.DataFrame] -) -> pd.DataFrame: +def preprocess_target(goal: str, target_reader: Iterator[pd.DataFrame]) -> pd.DataFrame: """Preprocess a target catalog dataset: workflow step 2. This function consumes :class:`pandas.DataFrame` chunks and
@@ -386,9 +376,7 @@ def in_both_datasets(col: str) -> bool: name_column = keys.NAME if in_both_datasets(name_column): feature_extractor.add( - features.ExactMatch( - name_column, name_column, label=f'{name_column}_exact' - ) + features.ExactMatch(name_column, name_column, label=f'{name_column}_exact') ) # URL features
@@ -437,9 +425,7 @@ def in_both_datasets(col: str) -> bool: ) ) - feature_vectors = feature_extractor.compute( - candidate_pairs, wikidata, target - ) + feature_vectors = feature_extractor.compute(candidate_pairs, wikidata, target) feature_vectors = feature_vectors[ ~feature_vectors.index.duplicated() # Drop duplicates ]
@@ -569,9 +555,7 @@ def _rename_or_drop_tid_columns(target): # in this case, they must be identical, # so take the first one target[keys.TID] = ( - no_nulls.iloc[:, 0] - if isinstance(no_nulls, pd.DataFrame) - else no_nulls + no_nulls.iloc[:, 0] if isinstance(no_nulls, pd.DataFrame) else no_nulls ) target.drop(columns=keys.CATALOG_ID, inplace=True)
@@ -738,9 +722,7 @@ def _build_date_object(value, slice_index, to_dates_list): try: to_dates_list.append(pd.Period(value[:slice_index])) except ValueError as ve: - LOGGER.warning( - "Skipping date that can't be parsed: %s. Reason: %s", value, ve - ) + LOGGER.warning("Skipping date that can't be parsed: %s. 
Reason: %s", value, ve) def _occupations_to_set(df): diff --git a/soweego/pipeline.py b/soweego/pipeline.py index 03e5076c..2619e330 100644 --- a/soweego/pipeline.py +++ b/soweego/pipeline.py @@ -15,9 +15,7 @@ @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) @click.option( '--validator/--no-validator', default=False, @@ -38,9 +36,7 @@ default=True, help='Upload results to Wikidata. Default: yes.', ) -def cli( - catalog: str, validator: bool, importer: bool, linker: bool, upload: bool -): +def cli(catalog: str, validator: bool, importer: bool, linker: bool, upload: bool): """Launch the whole pipeline.""" if importer: @@ -61,9 +57,7 @@ def cli( def _importer(target: str): """Contains all the command the importer has to do""" - LOGGER.info( - "Running importer for target: %s without resolving the URLs", target - ) + LOGGER.info("Running importer for target: %s without resolving the URLs", target) _invoke_no_exit(import_cli, [target]) @@ -75,9 +69,7 @@ def _linker(target: str, upload: bool): if not target_type: continue arguments = ( - [target, target_type, '--upload'] - if upload - else [target, target_type] + [target, target_type, '--upload'] if upload else [target, target_type] ) _invoke_no_exit(baseline.extract_cli, arguments) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 86c45b1b..fb3a8f59 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -21,13 +21,7 @@ import click from sqlalchemy.exc import SQLAlchemyError -from soweego.commons import ( - constants, - data_gathering, - keys, - target_database, - text_utils, -) +from soweego.commons import constants, data_gathering, keys, target_database, text_utils from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot from soweego.wikidata import api_requests, vocabulary @@ -39,9 +33,7 @@ # For all CLIs WD_CACHE_FNAME = '{catalog}_{entity}_{criterion}_wd_cache.pkl' # For `links_cli` and `bio_cli` -IDS_TO_BE_DEPRECATED_FNAME = ( - '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' -) +IDS_TO_BE_DEPRECATED_FNAME = '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' SHARED_STATEMENTS_FNAME = '{catalog}_{entity}_{criterion}_shared_statements.csv' WD_STATEMENTS_FNAME = 'wikidata_{criterion}_for_{catalog}_{entity}.csv' # For `dead_ids_cli` @@ -50,9 +42,7 @@ EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv' URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv' # For `bio_cli` -BIO_STATEMENTS_TO_BE_ADDED_FNAME = ( - '{catalog}_{entity}_bio_statements_to_be_added.csv' -) +BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' # URL prefixes for catalog providers QID_PREFIX = 'https://www.wikidata.org/wiki/' @@ -60,12 +50,8 @@ @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.option( '-d', '--deprecate', @@ -102,9 +88,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): ) wd_cache_path = os.path.join( dir_io, - WD_CACHE_FNAME.format( - catalog=catalog, entity=entity, criterion='dead_ids' - ), + 
WD_CACHE_FNAME.format(catalog=catalog, entity=entity, criterion='dead_ids'), ) # Handle Wikidata cache @@ -149,21 +133,15 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) @click.option( '-b', '--blacklist', is_flag=True, help='Filter low-quality URLs through a blacklist.', ) -@click.option( - '-u', '--upload', is_flag=True, help='Upload the output to Wikidata.' -) +@click.option('-u', '--upload', is_flag=True, help='Upload the output to Wikidata.') @click.option( '-s', '--sandbox', @@ -181,9 +159,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): default=constants.WORK_DIR, help=f'Input/output directory, default: {constants.WORK_DIR}.', ) -def links_cli( - catalog, entity, blacklist, upload, sandbox, dump_wikidata, dir_io -): +def links_cli(catalog, entity, blacklist, upload, sandbox, dump_wikidata, dir_io): """Validate identifiers against links. Dump 6 output files: @@ -235,15 +211,11 @@ def links_cli( ) wd_urls_path = os.path.join( dir_io, - WD_STATEMENTS_FNAME.format( - criterion=criterion, catalog=catalog, entity=entity - ), + WD_STATEMENTS_FNAME.format(criterion=criterion, catalog=catalog, entity=entity), ) wd_cache_path = os.path.join( dir_io, - WD_CACHE_FNAME.format( - catalog=catalog, entity=entity, criterion=criterion - ), + WD_CACHE_FNAME.format(catalog=catalog, entity=entity, criterion=criterion), ) # Wikidata cache @@ -272,17 +244,13 @@ def links_cli( ) = result # Dump output files _dump_deprecated(deprecate, deprecate_path) - _dump_csv_output( - add_ext_ids, add_ext_ids_path, 'third-party IDs to be added' - ) + _dump_csv_output(add_ext_ids, add_ext_ids_path, 'third-party IDs to be added') _dump_csv_output(add_urls, add_urls_path, 'URLs to be added') _dump_csv_output( ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced' ) _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced') - _dump_csv_output( - wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}' - ) + _dump_csv_output(wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}') # Dump Wikidata cache if dump_wikidata: @@ -291,9 +259,7 @@ def links_cli( # Using the highest protocol available for the current Python # version should be the most efficient solution pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) - LOGGER.info( - 'URLs gathered from Wikidata dumped to %s', wd_cache_path - ) + LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_cache_path) except MemoryError: LOGGER.warning('Could not pickle the Wikidata cache: memory error') @@ -309,35 +275,19 @@ def links_cli( 'deprecate', catalog, entity, deprecate, sandbox ) LOGGER.info('Starting addition of external IDs to Wikidata ...') - wikidata_bot.add_people_statements( - catalog, add_ext_ids, criterion, sandbox - ) + wikidata_bot.add_people_statements(catalog, add_ext_ids, criterion, sandbox) LOGGER.info('Starting addition of URLs to Wikidata ...') - wikidata_bot.add_people_statements( - catalog, add_urls, criterion, sandbox - ) - LOGGER.info( - 'Starting referencing of shared external IDs in Wikidata ...' 
- ) - wikidata_bot.add_people_statements( - catalog, add_ext_ids, criterion, sandbox - ) + wikidata_bot.add_people_statements(catalog, add_urls, criterion, sandbox) + LOGGER.info('Starting referencing of shared external IDs in Wikidata ...') + wikidata_bot.add_people_statements(catalog, add_ext_ids, criterion, sandbox) LOGGER.info('Starting referencing of shared URLs in Wikidata ...') - wikidata_bot.add_people_statements( - catalog, add_urls, criterion, sandbox - ) + wikidata_bot.add_people_statements(catalog, add_urls, criterion, sandbox) @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) -@click.option( - '-u', '--upload', is_flag=True, help='Upload the output to Wikidata.' -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) +@click.option('-u', '--upload', is_flag=True, help='Upload the output to Wikidata.') @click.option( '-s', '--sandbox', @@ -396,15 +346,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): ) wd_stmts_path = os.path.join( dir_io, - WD_STATEMENTS_FNAME.format( - criterion=criterion, catalog=catalog, entity=entity - ), + WD_STATEMENTS_FNAME.format(criterion=criterion, catalog=catalog, entity=entity), ) wd_cache_path = os.path.join( dir_io, - WD_CACHE_FNAME.format( - catalog=catalog, entity=entity, criterion=criterion - ), + WD_CACHE_FNAME.format(catalog=catalog, entity=entity, criterion=criterion), ) # Wikidata cache @@ -462,14 +408,10 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): LOGGER.info('Starting addition of extra statements to Wikidata ...') wikidata_bot.add_people_statements(catalog, add, criterion, sandbox) LOGGER.info('Starting referencing of shared statements in Wikidata ...') - wikidata_bot.add_people_statements( - catalog, reference, criterion, sandbox - ) + wikidata_bot.add_people_statements(catalog, reference, criterion, sandbox) -def dead_ids( - catalog: str, entity: str, wd_cache=None -) -> Tuple[DefaultDict, Dict]: +def dead_ids(catalog: str, entity: str, wd_cache=None) -> Tuple[DefaultDict, Dict]: """Look for dead identifiers in Wikidata. An identifier is dead if it does not exist in the given catalog when this function is executed. @@ -520,9 +462,7 @@ def dead_ids( .count() ) if existing == 0: - LOGGER.debug( - '%s %s identifier %s is dead', qid, catalog, tid - ) + LOGGER.debug('%s %s identifier %s is dead', qid, catalog, tid) dead[tid].add(qid) session.commit() except SQLAlchemyError as error: @@ -609,16 +549,12 @@ def links( target_database.get_catalog_pid(catalog, entity), wd_links, ) - data_gathering.gather_wikidata_links( - wd_links, url_pids, ext_id_pids_to_urls - ) + data_gathering.gather_wikidata_links(wd_links, url_pids, ext_id_pids_to_urls) else: wd_links = wd_cache # Validation - _validate( - keys.LINKS, wd_links, target_links, deprecate, add, reference, wd_only - ) + _validate(keys.LINKS, wd_links, target_links, deprecate, add, reference, wd_only) # URLs to be added: # 1. 
Separate external IDs from URLs @@ -737,9 +673,7 @@ def bio( wd_bio = wd_cache # Validation - _validate( - keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only - ) + _validate(keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only) return ( deprecate, @@ -889,8 +823,7 @@ def _compute_comparison_sets(criterion, wd_data, target_data): # In `target_data` we look for relevant date PIDs target_dates = set( filter( - lambda x: x[0] - in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), + lambda x: x[0] in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), target_data, ) ) @@ -1004,9 +937,7 @@ def _compare_dates(inputs): extra.add(extra_date) -def _match_dates_by_precision( - precision, wd_elem, wd_timestamp, t_elem, t_timestamp -): +def _match_dates_by_precision(precision, wd_elem, wd_timestamp, t_elem, t_timestamp): slice_indices = { vocabulary.YEAR: 4, vocabulary.MONTH: 7, diff --git a/soweego/validator/enrichment.py b/soweego/validator/enrichment.py index 5d33f73e..5c1b96f5 100644 --- a/soweego/validator/enrichment.py +++ b/soweego/validator/enrichment.py @@ -21,13 +21,7 @@ from sqlalchemy.exc import SQLAlchemyError from tqdm import tqdm -from soweego.commons import ( - constants, - data_gathering, - keys, - target_database, - utils, -) +from soweego.commons import constants, data_gathering, keys, target_database, utils from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot from soweego.wikidata import vocabulary @@ -36,15 +30,9 @@ @click.command() -@click.argument( - 'catalog', type=click.Choice(target_database.supported_targets()) -) -@click.argument( - 'entity', type=click.Choice(target_database.supported_entities()) -) -@click.option( - '-u', '--upload', is_flag=True, help='Upload statements to Wikidata.' -) +@click.argument('catalog', type=click.Choice(target_database.supported_targets())) +@click.argument('entity', type=click.Choice(target_database.supported_entities())) +@click.option('-u', '--upload', is_flag=True, help='Upload statements to Wikidata.') @click.option( '-s', '--sandbox', @@ -74,9 +62,7 @@ def works_people_cli(catalog, entity, upload, sandbox, dir_io): sys.exit(1) with open( - os.path.join( - dir_io, constants.WORKS_BY_PEOPLE_STATEMENTS % (catalog, entity) - ), + os.path.join(dir_io, constants.WORKS_BY_PEOPLE_STATEMENTS % (catalog, entity)), 'w', 1, ) as fout: diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index 9f9362a0..06214d08 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -63,15 +63,11 @@ def resolve_qid(term: str, language='en') -> Optional[str]: return response_body['search'][0]['id'] # Malformed JSON response except KeyError as e: - LOGGER.error( - "Missing '%s' key from JSON response: %s", e, response_body - ) + LOGGER.error("Missing '%s' key from JSON response: %s", e, response_body) return None # No search results except IndexError: - LOGGER.info( - "No QIDs found for search term '%s' (language: %s)", term, language - ) + LOGGER.info("No QIDs found for search term '%s' (language: %s)", term, language) return None @@ -97,9 +93,7 @@ def get_url_blacklist() -> Optional[set]: try: star = response_body['parse']['text']['*'] # Interesting nonsense key except KeyError as e: - LOGGER.error( - "Missing '%s' key from JSON response: %s", e, response_body - ) + LOGGER.error("Missing '%s' key from JSON response: %s", e, response_body) return None # The parsed page should be a
element
@@ -181,9 +175,7 @@ def get_links( claims = entity.get('claims') if claims: # Third-party links - yield _yield_expected_values( - qid, claims, url_pids, no_links_count - ) + yield _yield_expected_values(qid, claims, url_pids, no_links_count) # External ID links yield _yield_ext_id_links(
@@ -308,9 +300,7 @@ def build_session() -> requests.Session: :rtype: :py:class:`requests.Session` :return: the HTTP session to interact with the Wikidata API """ - session_dump_path = os.path.join( - constants.WORK_DIR, constants.WIKIDATA_API_SESSION - ) + session_dump_path = os.path.join(constants.WORK_DIR, constants.WIKIDATA_API_SESSION) try: return _load_cached_session(session_dump_path)
@@ -360,9 +350,7 @@ def build_session() -> requests.Session: return session -def parse_value( - value: Union[str, Dict] -) -> Union[str, Tuple[str, str], Set[str], None]: +def parse_value(value: Union[str, Dict]) -> Union[str, Tuple[str, str], Set[str], None]: """Parse a value returned by the Wikidata API into standard Python objects. The parser supports the following Wikidata
@@ -439,8 +427,7 @@ def _lookup_label(item_value): entity = entities.get(item_value) if entity is None: LOGGER.warning( - "Skipping unexpected JSON response with no %s " - "in the 'entities' key", + "Skipping unexpected JSON response with no %s " "in the 'entities' key", item_value, ) return None
@@ -544,9 +531,7 @@ def _process_bucket( processed[keys.URL] = list(processed[keys.URL]) # Expected claims - processed.update( - _return_claims_for_linker(qid, claims, needs, counters) - ) + processed.update(_return_claims_for_linker(qid, claims, needs, counters)) result.append(processed)
@@ -612,9 +597,7 @@ def _return_third_party_urls(qid, claims, url_pids, counters): available = url_pids.intersection(claims.keys()) if available: - LOGGER.debug( - 'Available third-party URL PIDs for %s: %s', qid, available - ) + LOGGER.debug('Available third-party URL PIDs for %s: %s', qid, available) for pid in available: for pid_claim in claims[pid]: value = _extract_value_from_claim(pid_claim, pid, qid)
@@ -747,9 +730,7 @@ def _yield_sitelinks(entity, qid, no_sitelinks_count): def _yield_ext_id_links(ext_id_pids_to_urls, claims, qid, no_ext_ids_count): - available_ext_id_pids = set(ext_id_pids_to_urls.keys()).intersection( - claims.keys() - ) + available_ext_id_pids = set(ext_id_pids_to_urls.keys()).intersection(claims.keys()) if not available_ext_id_pids: LOGGER.debug('No external identifier links for %s', qid)
@@ -771,9 +752,7 @@ def _yield_ext_id_links(ext_id_pids_to_urls, claims, qid, no_ext_ids_count): yield qid, formatter_url.replace('$1', ext_id) -def _yield_expected_values( - qid, claims, expected_pids, count, include_pid=False -): +def _yield_expected_values(qid, claims, expected_pids, count, include_pid=False): available = expected_pids.intersection(claims.keys()) if not available:
@@ -947,16 +926,12 @@ def _extract_value_from_claim(pid_claim, pid, qid): LOGGER.debug('Processing (%s, %s) claim: %s', qid, pid, pid_claim) main_snak = pid_claim.get('mainsnak') if not main_snak: - LOGGER.warning( - 'Skipping malformed (%s, %s) claim with no main snak', qid, pid - ) + LOGGER.warning('Skipping malformed (%s, %s) claim with no main snak', qid, pid) LOGGER.debug('Malformed claim: %s', pid_claim) return None snak_type = main_snak.get('snaktype') if not snak_type: - LOGGER.warning( - 'Skipping malformed (%s, %s) claim with no snak type', qid, pid - ) + LOGGER.warning('Skipping malformed (%s, %s) claim with no snak type', qid, pid) LOGGER.debug('Malformed 
claim: %s', pid_claim) return None if snak_type == 'novalue': @@ -976,9 +951,7 @@ def _extract_value_from_claim(pid_claim, pid, qid): return None value = data_value.get('value') if not value: - LOGGER.warning( - 'Skipping malformed (%s, %s) claim with no value', qid, pid - ) + LOGGER.warning('Skipping malformed (%s, %s) claim with no value', qid, pid) LOGGER.debug('Malformed claim: %s', pid_claim) return None LOGGER.debug('QID: %s - PID: %s - Value: %s', qid, pid, value) diff --git a/soweego/wikidata/sparql_queries.py b/soweego/wikidata/sparql_queries.py index 5a5382cd..c369de83 100755 --- a/soweego/wikidata/sparql_queries.py +++ b/soweego/wikidata/sparql_queries.py @@ -142,8 +142,7 @@ def external_id_pids_and_urls() -> Iterator[Dict]: formatter_url_dict = result.get(FORMATTER_URL_BINDING.lstrip('?')) if not formatter_url_dict: LOGGER.warning( - 'Skipping malformed query result: ' - 'no formatter URL binding in %s', + 'Skipping malformed query result: ' 'no formatter URL binding in %s', result, ) continue @@ -252,11 +251,9 @@ def run_query( # Items & identifiers if what == keys.IDENTIFIER: query = ( - IDENTIFIER_TEMPLATE - % (vocabulary.INSTANCE_OF, class_qid, catalog_pid) + IDENTIFIER_TEMPLATE % (vocabulary.INSTANCE_OF, class_qid, catalog_pid) if how == keys.CLASS_QUERY - else IDENTIFIER_TEMPLATE - % (vocabulary.OCCUPATION, class_qid, catalog_pid) + else IDENTIFIER_TEMPLATE % (vocabulary.OCCUPATION, class_qid, catalog_pid) ) return _parse_query_result( keys.IDENTIFIER, _run_paged_query(result_per_page, query) @@ -267,12 +264,9 @@ def run_query( query = ( LINKS_TEMPLATE % (vocabulary.INSTANCE_OF, class_qid, catalog_pid) if how == keys.CLASS_QUERY - else LINKS_TEMPLATE - % (vocabulary.OCCUPATION, class_qid, catalog_pid) - ) - return _parse_query_result( - keys.LINKS, _run_paged_query(result_per_page, query) + else LINKS_TEMPLATE % (vocabulary.OCCUPATION, class_qid, catalog_pid) ) + return _parse_query_result(keys.LINKS, _run_paged_query(result_per_page, query)) # Items without identifiers (for classification purposes) if what == keys.DATASET: @@ -437,8 +431,7 @@ def _make_request(query, response_format=DEFAULT_RESPONSE_FORMAT): # Random value between 0 and 1 wait_time = random() LOGGER.warning( - 'Exceeded concurrent queries limit, ' - 'will retry after %f seconds ...', + 'Exceeded concurrent queries limit, ' 'will retry after %f seconds ...', wait_time, ) @@ -521,9 +514,7 @@ def _run_paged_query(result_per_page, query): result_set = _make_request(' '.join(query_builder)) if not result_set: - LOGGER.error( - 'Skipping page %d because the query went wrong', pages - ) + LOGGER.error('Skipping page %d because the query went wrong', pages) pages += 1 continue