From 135e9211d98650421d120678df470e7fe7122a2d Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Fri, 13 Aug 2021 10:28:37 +0000
Subject: [PATCH 1/9] [WIP] start work on #418

---
 soweego/validator/checks.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index ff967438..0e544445 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -10,6 +10,7 @@ __copyright__ = 'Copyleft 2021, Hjfocs'
 
 import csv
+import gzip
 import json
 import logging
 import os
@@ -38,8 +39,10 @@
 # For `links_cli`
 EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv'
 URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv'
+WD_URLS_FNAME = 'wikidata_urls_for_{catalog}_{entity}.txt.gz'
 # For `bio_cli`
 BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv'
+WD_STATEMENTS_FNAME = 'wikidata_statements_for_{catalog}_{entity}.csv.gz'
 
 
 @click.command()
@@ -168,7 +171,7 @@ def links_cli(
 ):
     """Validate identifiers against links.
 
-    Dump 5 output files:
+    Dump 6 output files:
 
     1. catalog IDs to be deprecated. JSON format:
    {catalog_ID: [list of QIDs]}
@@ -183,6 +186,9 @@ def links_cli(
 
     5. URLs to be referenced. Same format as file #3
 
+    6. URLs found in Wikidata but not in the target catalog.
+    GZIP text format, one URL per line
+
     You can pass the '-u' flag to upload the output to Wikidata.
 
     The '-b' flag applies a URL blacklist of low-quality Web domains to file #3.
@@ -214,6 +220,11 @@ def links_cli(
             catalog=catalog, entity=entity, task='referenced'
         )
     )
+    wd_urls_path = os.path.join(
+        dir_io, WD_URLS_FNAME.format(
+            catalog=catalog, entity=entity
+        )
+    )
     wd_cache_path = os.path.join(
         dir_io, WD_CACHE_FNAME.format(
             catalog=catalog, entity=entity, criterion=criterion
@@ -230,6 +241,7 @@ def links_cli(
             catalog, entity, blacklist, wd_cache=wd_cache
         )
     else:
+        # FIXME add `wd_urls` arg
         deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links(
             catalog, entity, blacklist
         )
@@ -244,6 +256,8 @@ def links_cli(
     _dump_csv_output(add_urls, add_urls_path, 'URLs to be added')
     _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced')
     _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
+    with gzip.open(wd_urls_path, 'wt') as gzout:
+        gzout.writelines([url + '\n' for url in wd_urls])
 
     # Dump Wikidata cache
     if dump_wikidata:

From ef5def00625be380402296c5a0a32be563fcaddd Mon Sep 17 00:00:00 2001
From: Marco Fossati
Date: Mon, 16 Aug 2021 15:11:57 +0200
Subject: [PATCH 2/9] dump Wikidata values not in the target catalog, closes
 #418

---
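Note for reviewers: for the `links` criterion, the `wd_only` bookkeeping
introduced below reduces to plain set algebra between the two sources.
A toy sketch with made-up URLs (illustration only, not project code):

    wd_data = {'https://example.org/a', 'https://example.org/b'}
    target_data = {'https://example.org/b', 'https://example.org/c'}

    shared = wd_data & target_data   # {'https://example.org/b'}: to be referenced
    extra = target_data - wd_data    # {'https://example.org/c'}: to be added
    wd_only = wd_data - target_data  # {'https://example.org/a'}: dumped to file #6

The `bio` criterion goes through `_compare()` instead, since its values mix
(PID, date) couples with triples whose third element is itself a set.
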
 soweego/validator/checks.py | 191 +++++++++++++++++++++++------------
 1 file changed, 121 insertions(+), 70 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 0e544445..4425079a 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -10,7 +10,6 @@ __copyright__ = 'Copyleft 2021, Hjfocs'
 
 import csv
-import gzip
 import json
 import logging
 import os
@@ -173,21 +172,23 @@ def links_cli(
 
     Dump 6 output files:
 
-    1. catalog IDs to be deprecated. JSON format:
-    {catalog_ID: [list of QIDs]}
+    1. catalog IDs to be deprecated.
+    JSON format: {catalog_ID: [list of QIDs]}
 
-    2. third-party IDs to be added. CSV format:
-    QID,third-party_PID,third-party_ID,catalog_ID
+    2. third-party IDs to be added.
+    CSV format: QID,third-party_PID,third-party_ID,catalog_ID
 
-    3. URLs to be added. CSV format:
-    QID,P2888,URL,catalog_ID
+    3. URLs to be added.
+    CSV format: QID,P2888,URL,catalog_ID
 
-    4. third-party IDs to be referenced. Same format as file #2
+    4. third-party IDs to be referenced.
+    Same format as file #2
 
-    5. URLs to be referenced. Same format as file #3
+    5. URLs to be referenced.
+    Same format as file #3
 
     6. URLs found in Wikidata but not in the target catalog.
-    GZIP text format, one URL per line
+    CSV format: URL,QID
 
     You can pass the '-u' flag to upload the output to Wikidata.
@@ -237,12 +238,11 @@ def links_cli(
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
         # Discard the last return value: Wikidata cache
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, _ = links(
+        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, _ = links(
             catalog, entity, blacklist, wd_cache=wd_cache
         )
     else:
-        # FIXME add `wd_urls` arg
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links(
+        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links(
             catalog, entity, blacklist
         )
@@ -256,8 +256,7 @@ def links_cli(
     _dump_csv_output(add_urls, add_urls_path, 'URLs to be added')
     _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced')
     _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
-    with gzip.open(wd_urls_path, 'wt') as gzout:
-        gzout.writelines([url + '\n' for url in wd_urls])
+    _dump_csv_output(wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}')
 
     # Dump Wikidata cache
     if dump_wikidata:
@@ -333,15 +332,19 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
 
     Look for birth/death dates, birth/death places, gender.
 
-    Dump 3 output files:
+    Dump 4 output files:
 
-    1. catalog IDs to be deprecated. JSON format:
-    {catalog_ID: [list of QIDs]}
+    1. catalog IDs to be deprecated.
+    JSON format: {catalog_ID: [list of QIDs]}
 
-    2. statements to be added. CSV format:
-    QID,PID,value,catalog_ID
+    2. statements to be added.
+    CSV format: QID,PID,value,catalog_ID
 
-    3. shared statements to be referenced. Same format as file #2
+    3. shared statements to be referenced.
+    Same format as file #2
+
+    4. statements found in Wikidata but not in the target catalog.
+    CSV format: catalog_ID,PID,value,QID
 
     You can pass the '-u' flag to upload the output to Wikidata.
""" @@ -361,6 +364,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): catalog=catalog, entity=entity, criterion=criterion ) ) + wd_stmts_path = os.path.join( + dir_io, WD_STATEMENTS_FNAME.format( + catalog=catalog, entity=entity + ) + ) wd_cache_path = os.path.join( dir_io, WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion @@ -373,9 +381,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - deprecate, add, reference, _ = bio(catalog, entity, wd_cache=wd_cache) + deprecate, add, reference, wd_stmts, _ = bio(catalog, entity, wd_cache=wd_cache) else: - deprecate, add, reference, wd_cache = bio(catalog, entity) + deprecate, add, reference, wd_stmts, wd_cache = bio(catalog, entity) # Nothing to do: the catalog doesn't contain biographical data if deprecate is None: @@ -385,6 +393,10 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): _dump_deprecated(deprecate, deprecate_path) _dump_csv_output(add, add_path, 'statements to be added') _dump_csv_output(reference, ref_path, 'shared statements to be referenced') + _dump_csv_output( + wd_stmts, wd_stmts_path, + f'statements in Wikidata but not in {catalog} {entity}' + ) # Dump Wikidata cache if dump_wikidata: @@ -502,7 +514,7 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Union[Tuple[defaultdict, list, list, list, list, dict], Tuple[None, None, None, None, None, None]]: +) -> Union[Tuple[defaultdict, list, list, list, list, list, dict], Tuple[None, None, None, None, None, None, None]]: """Validate identifiers against available links. Also generate statements based on additional links @@ -531,23 +543,25 @@ def links( of URL domains. Default: ``False`` :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata in a previous run. Default: ``None`` - :return: ``tuple`` of 6 objects + :return: 7 objects 1. ``dict`` of identifiers that should be deprecated 2. ``list`` of third-party identifiers that should be added 3. ``list`` of URLs that should be added 4. ``list`` of third-party identifiers that should be referenced 5. ``list`` of URLs that should be referenced - 6. ``dict`` of links gathered from Wikidata + 6. ``list`` of URLs found in Wikidata but not in the target catalog + 7. ``dict`` of links gathered from Wikidata """ # Target catalog side first: # enable early return in case of no target links target_links = data_gathering.gather_target_links(entity, catalog) if target_links is None: - return None, None, None, None, None, None + return None, None, None, None, None, None, None - deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add = defaultdict(set), defaultdict(set) + reference, wd_only = defaultdict(set), defaultdict(set) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -566,9 +580,12 @@ def links( wd_links = wd_cache # Validation - _validate(keys.LINKS, wd_links, target_links, deprecate, add, reference) + _validate( + keys.LINKS, wd_links, target_links, + deprecate, add, reference, wd_only + ) - # Links to be added: + # URLs to be added: # 1. 
     # 1. Separate external IDs from URLs
     add_ext_ids, add_urls = data_gathering.extract_ids_from_urls(
         add, ext_id_pids_to_urls
@@ -577,30 +594,46 @@ def links(
     if url_blacklist:
         add_urls = _apply_url_blacklist(add_urls)
 
-    # Links to be referenced: separate external IDs from URLs
+    # URLs to be referenced: separate external IDs from URLs
     ref_ext_ids, ref_urls = data_gathering.extract_ids_from_urls(
         reference, ext_id_pids_to_urls
     )
 
+    # Wikidata-only URLs: convert into a list of statements
+    wd_only_urls = []
+    for (qid, tid), urls in wd_only.items():
+        for url in urls:
+            wd_only_urls.append((tid, url, qid))
+
     LOGGER.info(
         'Validation completed. Target: %s %s. '
         'IDs to be deprecated: %d. '
         'Third-party IDs to be added: %d. '
-        'URL statements to be added: %d',
+        'URL statements to be added: %d. '
         'Third-party IDs to be referenced: %d. '
-        'URL statements to be referenced: %d',
+        'URL statements to be referenced: %d. '
+        'URLs in Wikidata but not in the target: %d',
         catalog, entity,
         len(deprecate),
-        len(add_ext_ids), len(add_urls),
-        len(ref_ext_ids), len(ref_urls)
+        len(add_ext_ids),
+        len(add_urls),
+        len(ref_ext_ids),
+        len(ref_urls),
+        len(wd_only_urls)
     )
 
-    return deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_links
+    return (
+        deprecate,
+        add_ext_ids, add_urls,
+        ref_ext_ids, ref_urls,
+        wd_only_urls,
+        wd_links
+    )
 
 
 def bio(
     catalog: str, entity: str, wd_cache=None
-) -> Union[Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None]]:
+) -> Union[Tuple[defaultdict, Iterator, Iterator, Iterator, dict], Tuple[None, None, None, None, None]]:
     """Validate identifiers against available biographical data.
 
     Look for:
@@ -631,21 +664,23 @@ def bio(
        A supported entity
     :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
       in a previous run
-    :return: a ``tuple`` of 4 objects
+    :return: 5 objects
 
         1. ``dict`` of identifiers that should be deprecated
         2. ``generator`` of statements that should be added
         3. ``generator`` of shared statements that should be referenced
-        4. ``dict`` of biographical data gathered from Wikidata
+        4. ``generator`` of statements found in Wikidata but not in the target catalog
+        5. ``dict`` of biographical data gathered from Wikidata
 
     """
     # Target catalog side first:
     # enable early return in case of no target data
     target_bio = data_gathering.gather_target_biodata(entity, catalog)
     if target_bio is None:
-        return None, None, None, None
+        return None, None, None, None, None
 
-    deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set)
+    deprecate, add = defaultdict(set), defaultdict(set)
+    reference, wd_only = defaultdict(set), defaultdict(set)
 
     # Wikidata side
     if wd_cache is None:
@@ -661,14 +696,16 @@ def bio(
         wd_bio = wd_cache
 
     # Validation
-    _validate(
-        keys.BIODATA,
-        wd_bio, target_bio,
-        deprecate, add, reference
-    )
+    _validate(keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only)
+
+    return (
+        deprecate,
+        _bio_statements_generator(add),
+        _bio_statements_generator(reference),
+        _bio_statements_generator(wd_only, qid_first=False),
+        wd_bio
+    )
 
-    return deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), wd_bio
-
 
 def _apply_url_blacklist(url_statements):
     LOGGER.info('Applying URL blacklist ...')
@@ -691,13 +728,16 @@ def _apply_url_blacklist(url_statements):
     return url_statements
 
 
-def _bio_statements_generator(stmts_dict):
+def _bio_statements_generator(stmts_dict, qid_first=True):
     for (qid, tid), values in stmts_dict.items():
         for pid, value in values:
-            yield qid, pid, value, tid
+            if qid_first:
+                yield qid, pid, value, tid
+            else:
+                yield tid, pid, value, qid
 
 
-def _validate(criterion, wd, target_generator, deprecate, add, reference):
+def _validate(criterion, wd, target_generator, deprecate, add, reference, wd_only):
     LOGGER.info('Starting check against target %s ...', criterion)
     target = _consume_target_generator(target_generator)
@@ -732,11 +772,11 @@ def _validate(
             )
             continue
 
-        shared_data, extra_data = _compute_shared_and_extra(
+        shared_set, extra_set, wd_only_set = _compute_comparison_sets(
             criterion, wd_data, target_data
         )
 
-        if not shared_data:
+        if not shared_set:
            LOGGER.debug(
                'No shared %s between %s and %s. The identifier '
                'statement should be deprecated',
@@ -751,37 +791,42 @@ def _validate(
                 qid,
                 tid,
                 criterion,
-                shared_data,
+                shared_set,
             )
-            reference[(qid, tid)].update(shared_data)
+            reference[(qid, tid)].update(shared_set)
 
-            if extra_data:
+            if extra_set:
                 LOGGER.debug(
                     '%s has extra %s that should be added to %s: %s',
-                    tid,
-                    criterion,
-                    qid,
-                    extra_data,
+                    tid, criterion, qid, extra_set
                 )
-                add[(qid, tid)].update(extra_data)
+                add[(qid, tid)].update(extra_set)
             else:
                 LOGGER.debug('%s has no extra %s', tid, criterion)
 
+            if wd_only_set:
+                LOGGER.debug('%s has %s not in %s: %s', qid, criterion, tid, wd_only_set)
+                wd_only[(qid, tid)].update(wd_only_set)
+            else:
+                LOGGER.debug('%s has no extra %s', qid, criterion)
+
     LOGGER.info(
         'Check against target %s completed: %d IDs to be deprecated, '
-        '%d Wikidata items with statements to be added, ',
-        '%d Wikidata items with shared statements to be referenced',
+        '%d Wikidata items with statements to be added, '
+        '%d Wikidata items with shared statements to be referenced, '
+        '%d values in Wikidata but not in the target catalog',
         criterion, len(deprecate),
         len(add),
-        len(reference)
+        len(reference),
+        len(wd_only)
     )
 
 
@@ -795,14 +840,16 @@ def _validate(
-def _compute_shared_and_extra(criterion, wd_data, target_data):
+def _compute_comparison_sets(criterion, wd_data, target_data):
     if criterion == keys.LINKS:
         shared = wd_data.intersection(target_data)
         extra = target_data.difference(wd_data)
+        wd_only = wd_data.difference(target_data)
     # Biographical validation requires more complex comparisons
     elif criterion == keys.BIODATA:
         # `wd_data` has either couples or triples: couples are dates
         wd_dates = set(filter(lambda x: len(x) == 2, wd_data))
         # No cast to `set` because `wd_data` triples hold sets themselves
         wd_other = list(filter(lambda x: len(x) == 3, wd_data))
         # In `target_data` we look for relevant date PIDs
         target_dates = set(filter(
@@ -812,10 +859,16 @@ def _compute_comparison_sets(criterion, wd_data, target_data):
         ))
         target_other = target_data.difference(target_dates)
         shared_dates, extra_dates = _compare('dates', wd_dates, target_dates)
+        wd_only_dates = wd_dates.difference(shared_dates)
         shared_other, extra_other = _compare('other', wd_other, target_other)
+        # `wd_other` has triples: build a set with couples
+        # to directly compute the difference with `shared_other`
+        wd_other_set = {(pid, qid) for pid, qid, _ in wd_other}
+        wd_only_other = wd_other_set.difference(shared_other)
         shared = shared_dates | shared_other
         extra = extra_dates | extra_other
+        wd_only = wd_only_dates | wd_only_other
     else:
         raise ValueError(
             f"Invalid validation criterion: '{criterion}'. "
             f"Please use either '{keys.LINKS}' or '{keys.BIODATA}'"
         )
-    return shared, extra
+    return shared, extra, wd_only
 
 
 def _compare(what, wd, target):
@@ -935,12 +986,12 @@ def _dump_deprecated(data, outpath):
         LOGGER.info("No IDs to be deprecated, won't dump to file")
 
 
-def _dump_csv_output(data, outpath, log_msg_subject):
+def _dump_csv_output(data, out_path, log_msg_subject):
     if data:
-        with open(outpath, 'w') as ids_out:
-            writer = csv.writer(ids_out)
+        with open(out_path, 'w') as fout:
+            writer = csv.writer(fout)
             writer.writerows(data)
-        LOGGER.info('%s dumped to %s', log_msg_subject, outpath)
+        LOGGER.info('%s dumped to %s', log_msg_subject, out_path)
     else:
         LOGGER.info("No %s, won't dump to file", log_msg_subject)

From 0dc09dea6321ad718a530413f1d1c23cfdc4234d Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Tue, 17 Aug 2021 10:22:51 +0000
Subject: [PATCH 3/9] refactor WD-only statements file name

---
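Note for reviewers: both CLIs now derive the Wikidata-only dump name from a
single template. A quick sketch of the resulting file names, assuming
`links_cli` sets `criterion` to 'links' as `bio_cli` sets it to 'bio'
('discogs' and 'musician' are placeholder values):

    WD_STATEMENTS_FNAME = 'wikidata_{criterion}_for_{catalog}_{entity}.csv'

    WD_STATEMENTS_FNAME.format(criterion='links', catalog='discogs', entity='musician')
    # -> 'wikidata_links_for_discogs_musician.csv'
    WD_STATEMENTS_FNAME.format(criterion='bio', catalog='discogs', entity='musician')
    # -> 'wikidata_bio_for_discogs_musician.csv'
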
" f"Please use either '{keys.LINKS}' or '{keys.BIODATA}'" ) - return shared, extra + return shared, extra, wd_only def _compare(what, wd, target): @@ -935,12 +986,12 @@ def _dump_deprecated(data, outpath): LOGGER.info("No IDs to be deprecated, won't dump to file") -def _dump_csv_output(data, outpath, log_msg_subject): +def _dump_csv_output(data, out_path, log_msg_subject): if data: - with open(outpath, 'w') as ids_out: - writer = csv.writer(ids_out) + with open(out_path, 'w') as fout: + writer = csv.writer(fout) writer.writerows(data) - LOGGER.info('%s dumped to %s', log_msg_subject, outpath) + LOGGER.info('%s dumped to %s', log_msg_subject, out_path) else: LOGGER.info("No %s, won't dump to file", log_msg_subject) From 0dc09dea6321ad718a530413f1d1c23cfdc4234d Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 17 Aug 2021 10:22:51 +0000 Subject: [PATCH 3/9] refactor WD-only statements file name --- soweego/validator/checks.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 4425079a..1f5c00c9 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -33,15 +33,14 @@ WD_CACHE_FNAME = '{catalog}_{entity}_{criterion}_wd_cache.pkl' IDS_TO_BE_DEPRECATED_FNAME = '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' SHARED_STATEMENTS_FNAME = '{catalog}_{entity}_{criterion}_shared_statements.csv' +WD_STATEMENTS_FNAME = 'wikidata_{criterion}_for_{catalog}_{entity}.csv' # For `dead_ids_cli` DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' # For `links_cli` EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv' URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv' -WD_URLS_FNAME = 'wikidata_urls_for_{catalog}_{entity}.txt.gz' # For `bio_cli` BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' -WD_STATEMENTS_FNAME = 'wikidata_statements_for_{catalog}_{entity}.csv.gz' @click.command() @@ -222,8 +221,8 @@ def links_cli( ) ) wd_urls_path = os.path.join( - dir_io, WD_URLS_FNAME.format( - catalog=catalog, entity=entity + dir_io, WD_STATEMENTS_FNAME.format( + criterion=criterion, catalog=catalog, entity=entity ) ) wd_cache_path = os.path.join( @@ -366,7 +365,7 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): ) wd_stmts_path = os.path.join( dir_io, WD_STATEMENTS_FNAME.format( - catalog=catalog, entity=entity + criterion=criterion, catalog=catalog, entity=entity ) ) wd_cache_path = os.path.join( @@ -406,7 +405,7 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # version should be the most efficient solution pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path + 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path ) except MemoryError: LOGGER.warning('Could not pickle the Wikidata cache: memory error') From 9f30cd9213d337e136ae5191b0030748e4dcc88d Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 17 Aug 2021 16:48:12 +0000 Subject: [PATCH 4/9] avoid useless computation: pre-filter missing non-date target values --- soweego/validator/checks.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 1f5c00c9..68f536fe 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -830,7 +830,7 @@ def _compute_comparison_sets(criterion, wd_data, target_data): elif 
 soweego/validator/checks.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 1f5c00c9..68f536fe 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -830,7 +830,7 @@ def _compute_comparison_sets(criterion, wd_data, target_data):
     elif criterion == keys.BIODATA:
         # `wd_data` has either couples or triples: couples are dates
         wd_dates = set(filter(lambda x: len(x) == 2, wd_data))
-        # No cast to `set` because `wd_data` triples hold sets themselves
+        # Don't cast to set: `wd_data` triples hold sets themselves
         wd_other = list(filter(lambda x: len(x) == 3, wd_data))
         # In `target_data` we look for relevant date PIDs
         target_dates = set(filter(
@@ -865,6 +865,10 @@ def _compare(what, wd, target):
     # the same property
     wd_matches, target_matches = [], []
 
+    # Filter missing target values (doesn't apply to dates)
+    if what == 'other':
+        target = list(filter(lambda x: x[1] is not None, target))
+
     for i, wd_elem in enumerate(wd):
         for j, t_elem in enumerate(target):
             # Don't compare when already matched
@@ -875,19 +879,18 @@ def _compare(what, wd, target):
             if wd_elem[0] != t_elem[0]:
                 continue
 
-            # Skip unexpected `None` values
-            if None in (wd_elem[1], t_elem[1]):
-                LOGGER.warning(
-                    'Skipping unexpected %s pair with missing value(s)',
-                    (wd_elem, t_elem),
-                )
-                continue
-
             inputs = (
                 shared, extra, wd_matches, target_matches, i, wd_elem, j, t_elem
             )
             if what == 'dates':
+                # Missing dates are unexpected: skip but warn
+                if None in (wd_elem[1], t_elem[1]):
+                    LOGGER.warning(
+                        'Skipping unexpected %s date pair with missing value(s)',
+                        (wd_elem, t_elem),
+                    )
+                    continue
                 _compare_dates(inputs)
             elif what == 'other':
                 _compare_other(inputs)

From 6e39c34d0cd14b73ef2c85743739883537ba6803 Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Wed, 18 Aug 2021 10:54:24 +0000
Subject: [PATCH 5/9] dump full Wikidata URLs for catalog providers

---
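Note for reviewers: catalog providers get rows that spell out full
wikidata.org URLs instead of bare QIDs and PIDs. A sketch of the prefixing
with placeholder identifiers:

    QID_PREFIX = 'https://www.wikidata.org/wiki/'
    PID_PREFIX = QID_PREFIX + 'Property:'

    tid, pid, value, qid = '123', 'P19', 'Q90', 'Q42'
    row = (tid, PID_PREFIX + pid, QID_PREFIX + value, QID_PREFIX + qid)
    # -> ('123',
    #     'https://www.wikidata.org/wiki/Property:P19',
    #     'https://www.wikidata.org/wiki/Q90',
    #     'https://www.wikidata.org/wiki/Q42')

In the real generator the value is only prefixed when it matches
`constants.QID_REGEX`, since biographical values can also be plain literals
such as dates.
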
""" @@ -599,10 +605,11 @@ def links( ) # Wikidata-only URLs: convert into a list of statements + # with complete Wikidata item URLs wd_only_urls = [] for (qid, tid), urls in wd_only.items(): for url in urls: - wd_only_urls.append((tid, url, qid)) + wd_only_urls.append((tid, url, QID_PREFIX + qid)) LOGGER.info( 'Validation completed. Target: %s %s. ' @@ -701,7 +708,7 @@ def bio( deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), - _bio_statements_generator(wd_only, qid_first=False), + _bio_statements_generator(wd_only, for_catalogs=True), wd_bio ) @@ -727,13 +734,15 @@ def _apply_url_blacklist(url_statements): return url_statements -def _bio_statements_generator(stmts_dict, qid_first=True): +def _bio_statements_generator(stmts_dict, for_catalogs=False): for (qid, tid), values in stmts_dict.items(): for pid, value in values: - if qid_first: + if not for_catalogs: yield qid, pid, value, tid else: - yield tid, pid, value, qid + if match(constants.QID_REGEX, value): + value = QID_PREFIX + value + yield tid, PID_PREFIX + pid, value, QID_PREFIX + qid def _validate(criterion, wd, target_generator, deprecate, add, reference, wd_only): From 2a8811a528b5299edb484b1346963f95d227d8c3 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Fri, 20 Aug 2021 14:48:32 +0000 Subject: [PATCH 6/9] fix bad merge conflict resolution --- soweego/validator/checks.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 4950c7fb..32916dec 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -255,14 +255,9 @@ def links_cli( catalog, entity, blacklist, wd_cache=wd_cache ) else: - ( - deprecate, - add_ext_ids, - add_urls, - ref_ext_ids, - ref_urls, - wd_cache, - ) = links(catalog, entity, blacklist) + deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links( + catalog, entity, blacklist + ) # Nothing to do: the catalog doesn't contain links if deprecate is None: From 6a02411712c388fcc63863e1c13f706bcb565791 Mon Sep 17 00:00:00 2001 From: travis Date: Mon, 23 Aug 2021 11:01:59 +0000 Subject: [PATCH 7/9] format code & organize imports --- soweego/validator/checks.py | 93 +++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 32916dec..d6950cf3 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -234,9 +234,10 @@ def links_cli( URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'), ) wd_urls_path = os.path.join( - dir_io, WD_STATEMENTS_FNAME.format( + dir_io, + WD_STATEMENTS_FNAME.format( criterion=criterion, catalog=catalog, entity=entity - ) + ), ) wd_cache_path = os.path.join( dir_io, @@ -251,13 +252,25 @@ def links_cli( wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, _ = links( - catalog, entity, blacklist, wd_cache=wd_cache - ) + ( + deprecate, + add_ext_ids, + add_urls, + ref_ext_ids, + ref_urls, + wd_urls, + _, + ) = links(catalog, entity, blacklist, wd_cache=wd_cache) else: - deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links( - catalog, entity, blacklist - ) + ( + deprecate, + add_ext_ids, + add_urls, + ref_ext_ids, + ref_urls, + wd_urls, + wd_cache, + ) = links(catalog, entity, blacklist) # Nothing to do: the catalog 
 soweego/validator/checks.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 4950c7fb..32916dec 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -255,14 +255,9 @@ def links_cli(
             catalog, entity, blacklist, wd_cache=wd_cache
         )
     else:
-        (
-            deprecate,
-            add_ext_ids,
-            add_urls,
-            ref_ext_ids,
-            ref_urls,
-            wd_cache,
-        ) = links(catalog, entity, blacklist)
+        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links(
+            catalog, entity, blacklist
+        )
 
     # Nothing to do: the catalog doesn't contain links
     if deprecate is None:

From 6a02411712c388fcc63863e1c13f706bcb565791 Mon Sep 17 00:00:00 2001
From: travis
Date: Mon, 23 Aug 2021 11:01:59 +0000
Subject: [PATCH 7/9] format code & organize imports

---
 soweego/validator/checks.py | 93 +++++++++++++++++++++++------------
 1 file changed, 64 insertions(+), 29 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 32916dec..d6950cf3 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -234,9 +234,10 @@ def links_cli(
         URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'),
     )
     wd_urls_path = os.path.join(
-        dir_io, WD_STATEMENTS_FNAME.format(
+        dir_io,
+        WD_STATEMENTS_FNAME.format(
             criterion=criterion, catalog=catalog, entity=entity
-        )
+        ),
     )
     wd_cache_path = os.path.join(
         dir_io,
@@ -251,13 +252,25 @@ def links_cli(
         with open(wd_cache_path, 'rb') as cin:
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
         # Discard the last return value: Wikidata cache
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, _ = links(
-            catalog, entity, blacklist, wd_cache=wd_cache
-        )
+        (
+            deprecate,
+            add_ext_ids,
+            add_urls,
+            ref_ext_ids,
+            ref_urls,
+            wd_urls,
+            _,
+        ) = links(catalog, entity, blacklist, wd_cache=wd_cache)
     else:
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links(
-            catalog, entity, blacklist
-        )
+        (
+            deprecate,
+            add_ext_ids,
+            add_urls,
+            ref_ext_ids,
+            ref_urls,
+            wd_urls,
+            wd_cache,
+        ) = links(catalog, entity, blacklist)
 
     # Nothing to do: the catalog doesn't contain links
     if deprecate is None:
@@ -273,7 +286,9 @@ def links_cli(
     _dump_csv_output(
         ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced'
     )
     _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
-    _dump_csv_output(wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}')
+    _dump_csv_output(
+        wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}'
+    )
 
     # Dump Wikidata cache
     if dump_wikidata:
@@ -385,9 +400,10 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
         ),
     )
     wd_stmts_path = os.path.join(
-        dir_io, WD_STATEMENTS_FNAME.format(
+        dir_io,
+        WD_STATEMENTS_FNAME.format(
             criterion=criterion, catalog=catalog, entity=entity
-        )
+        ),
     )
     wd_cache_path = os.path.join(
         dir_io,
@@ -402,7 +418,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
         # Discard the last return value: Wikidata cache
-        deprecate, add, reference, wd_stmts, _ = bio(catalog, entity, wd_cache=wd_cache)
+        deprecate, add, reference, wd_stmts, _ = bio(
+            catalog, entity, wd_cache=wd_cache
+        )
     else:
         deprecate, add, reference, wd_stmts, wd_cache = bio(catalog, entity)
 
@@ -415,8 +433,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
     _dump_csv_output(add, add_path, 'statements to be added')
     _dump_csv_output(reference, ref_path, 'shared statements to be referenced')
     _dump_csv_output(
-        wd_stmts, wd_stmts_path,
-        f'statements in Wikidata but not in {catalog} {entity}'
+        wd_stmts,
+        wd_stmts_path,
+        f'statements in Wikidata but not in {catalog} {entity}',
     )
 
     # Dump Wikidata cache
@@ -428,7 +447,8 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
             pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL)
             LOGGER.info(
-                'Biographical data gathered from Wikidata dumped to %s', wd_cache_path
+                'Biographical data gathered from Wikidata dumped to %s',
+                wd_cache_path,
             )
         except MemoryError:
             LOGGER.warning('Could not pickle the Wikidata cache: memory error')
@@ -601,8 +620,7 @@ def links(
 
     # Validation
     _validate(
-        keys.LINKS, wd_links, target_links,
-        deprecate, add, reference, wd_only
+        keys.LINKS, wd_links, target_links, deprecate, add, reference, wd_only
     )
 
     # URLs to be added:
@@ -634,21 +652,24 @@ def links(
         'Third-party IDs to be referenced: %d. '
        'URL statements to be referenced: %d. '
        'URLs in Wikidata but not in the target: %d',
-        catalog, entity,
+        catalog,
+        entity,
         len(deprecate),
         len(add_ext_ids),
         len(add_urls),
         len(ref_ext_ids),
         len(ref_urls),
-        len(wd_only_urls)
+        len(wd_only_urls),
     )
 
     return (
         deprecate,
-        add_ext_ids, add_urls,
-        ref_ext_ids, ref_urls,
+        add_ext_ids,
+        add_urls,
+        ref_ext_ids,
+        ref_urls,
         wd_only_urls,
-        wd_links
+        wd_links,
     )
 
 
@@ -717,14 +738,16 @@ def bio(
         wd_bio = wd_cache
 
     # Validation
-    _validate(keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only)
+    _validate(
+        keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only
+    )
 
     return (
         deprecate,
         _bio_statements_generator(add),
         _bio_statements_generator(reference),
         _bio_statements_generator(wd_only, for_catalogs=True),
-        wd_bio
+        wd_bio,
     )
 
 
@@ -758,7 +781,9 @@ def _bio_statements_generator(stmts_dict, for_catalogs=False):
             yield tid, PID_PREFIX + pid, value, QID_PREFIX + qid
 
 
-def _validate(criterion, wd, target_generator, deprecate, add, reference, wd_only):
+def _validate(
+    criterion, wd, target_generator, deprecate, add, reference, wd_only
+):
     LOGGER.info('Starting check against target %s ...', criterion)
     target = _consume_target_generator(target_generator)
 
@@ -819,14 +844,23 @@ def _validate(
             if extra_set:
                 LOGGER.debug(
                     '%s has extra %s that should be added to %s: %s',
-                    tid, criterion, qid, extra_set
+                    tid,
+                    criterion,
+                    qid,
+                    extra_set,
                 )
                 add[(qid, tid)].update(extra_set)
             else:
                 LOGGER.debug('%s has no extra %s', tid, criterion)
 
             if wd_only_set:
-                LOGGER.debug('%s has %s not in %s: %s', qid, criterion, tid, wd_only_set)
+                LOGGER.debug(
+                    '%s has %s not in %s: %s',
+                    qid,
+                    criterion,
+                    tid,
+                    wd_only_set,
+                )
                 wd_only[(qid, tid)].update(wd_only_set)
             else:
                 LOGGER.debug('%s has no extra %s', qid, criterion)
@@ -836,10 +870,11 @@ def _validate(
         '%d Wikidata items with statements to be added, '
         '%d Wikidata items with shared statements to be referenced, '
         '%d values in Wikidata but not in the target catalog',
-        criterion, len(deprecate),
+        criterion,
+        len(deprecate),
         len(add),
         len(reference),
-        len(wd_only)
+        len(wd_only),
     )

From 6aca8b9457acac3703b10dfc11e8d1f992359db6 Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Thu, 2 Sep 2021 16:31:48 +0000
Subject: [PATCH 8/9] return a single None to simplify code and comply with
 type hints

Tackles #428#discussion_r699422874
---
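Note for reviewers: collapsing the all-None tuple into a single `None` also
lets the hints read as `Optional[Tuple[...]]` rather than a `Union` of two
tuple shapes. A minimal sketch of the resulting calling convention (toy
function, not the soweego API):

    from typing import Optional, Tuple

    def gather(x: int) -> Optional[Tuple[int, int]]:
        if x < 0:  # nothing to do: early return
            return None
        return x, x * 2

    result = gather(3)
    if result is None:
        raise SystemExit('nothing to validate')
    first, second = result
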
 soweego/validator/checks.py | 61 ++++++++++++++---------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index d6950cf3..aa7fbdca 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -246,36 +246,22 @@ def links_cli(
         ),
     )
 
-    # Handle Wikidata cache
+    # Wikidata cache
+    wd_cache = None
     if os.path.isfile(wd_cache_path):
         with open(wd_cache_path, 'rb') as cin:
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
-        # Discard the last return value: Wikidata cache
-        (
-            deprecate,
-            add_ext_ids,
-            add_urls,
-            ref_ext_ids,
-            ref_urls,
-            wd_urls,
-            _,
-        ) = links(catalog, entity, blacklist, wd_cache=wd_cache)
-    else:
-        (
-            deprecate,
-            add_ext_ids,
-            add_urls,
-            ref_ext_ids,
-            ref_urls,
-            wd_urls,
-            wd_cache,
-        ) = links(catalog, entity, blacklist)
+
+    # Run validation
+    result = links(catalog, entity, url_blacklist=blacklist, wd_cache=wd_cache)
 
     # Nothing to do: the catalog doesn't contain links
-    if deprecate is None:
+    if result is None:
         return
 
+    # Unpack the result tuple
+    deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = result
     # Dump output files
     _dump_deprecated(deprecate, deprecate_path)
     _dump_csv_output(
@@ -383,6 +369,7 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
     You can pass the '-u' flag to upload the output to Wikidata.
     """
     criterion = 'bio'
+    # Output paths
     deprecate_path = os.path.join(
         dir_io,
         IDS_TO_BE_DEPRECATED_FNAME.format(
@@ -412,22 +399,22 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
         ),
     )
 
-    # Handle Wikidata cache
+    # Wikidata cache
+    wd_cache = None
     if os.path.isfile(wd_cache_path):
         with open(wd_cache_path, 'rb') as cin:
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
-        # Discard the last return value: Wikidata cache
-        deprecate, add, reference, wd_stmts, _ = bio(
-            catalog, entity, wd_cache=wd_cache
-        )
-    else:
-        deprecate, add, reference, wd_stmts, wd_cache = bio(catalog, entity)
+
+    # Run validation
+    result = bio(catalog, entity, wd_cache=wd_cache)
 
     # Nothing to do: the catalog doesn't contain biographical data
-    if deprecate is None:
+    if result is None:
         return
 
+    # Unpack the result tuple
+    deprecate, add, reference, wd_stmts, wd_cache = result
     # Dump output files
     _dump_deprecated(deprecate, deprecate_path)
     _dump_csv_output(add, add_path, 'statements to be added')
@@ -557,12 +544,12 @@ def links(
     """Validate identifiers against available links.
 
     Also generate statements based on additional links
-    found in the given catalog.
+    found in the target catalog.
     They can be used to enrich Wikidata items.
 
     **How it works:**
 
-    1. gather links from the given catalog
+    1. gather links from the target catalog
     2. gather links from relevant Wikidata items
     3. look for shared links between pairs of Wikidata and catalog items:
 
@@ -592,12 +579,14 @@ def links(
         6. ``list`` of URLs found in Wikidata but not in the target catalog
         7. ``dict`` of links gathered from Wikidata
+        or ``None`` if the target catalog has no links.
+
     """
     # Target catalog side first:
     # enable early return in case of no target links
     target_links = data_gathering.gather_target_links(entity, catalog)
     if target_links is None:
-        return None, None, None, None, None, None, None
+        return None
 
     deprecate, add = defaultdict(set), defaultdict(set)
     reference, wd_only = defaultdict(set), defaultdict(set)
@@ -685,7 +674,7 @@ def bio(
     - gender
 
     Also generate statements based on additional data
-    found in the given catalog.
+    found in the target catalog.
     They can be used to enrich Wikidata items.
 
     **How it works:**
@@ -714,12 +703,14 @@ def bio(
         4. ``generator`` of statements found in Wikidata but not in the target catalog
         5. ``dict`` of biographical data gathered from Wikidata
+        or ``None`` if the target catalog has no biographical data.
+
     """
     # Target catalog side first:
     # enable early return in case of no target data
     target_bio = data_gathering.gather_target_biodata(entity, catalog)
     if target_bio is None:
-        return None, None, None, None, None
+        return None
 
     deprecate, add = defaultdict(set), defaultdict(set)
     reference, wd_only = defaultdict(set), defaultdict(set)

From 678995e13b1ff24d0674c6c8f2e8b70fed0fefc5 Mon Sep 17 00:00:00 2001
From: travis
Date: Thu, 2 Sep 2021 16:35:09 +0000
Subject: [PATCH 9/9] format code & organize imports

---
 soweego/validator/checks.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index aa7fbdca..d5cfac6e 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -261,7 +261,15 @@ def links_cli(
         return
 
     # Unpack the result tuple
-    deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = result
+    (
+        deprecate,
+        add_ext_ids,
+        add_urls,
+        ref_ext_ids,
+        ref_urls,
+        wd_urls,
+        wd_cache,
+    ) = result
     # Dump output files
     _dump_deprecated(deprecate, deprecate_path)
     _dump_csv_output(