From 31fab3ae1718a8dcd5ff328d434752428ed5137c Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Mon, 2 Aug 2021 15:43:30 +0000 Subject: [PATCH 01/22] update & refurbish CLI docstrings --- soweego/validator/checks.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 9a293e48..d0e96bdb 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -157,11 +157,14 @@ def links_cli( Dump 3 output files: - 1. target identifiers to be deprecated. Format: (JSON) {identifier: [list of QIDs]} + 1. catalog IDs to be deprecated. JSON format: + {catalog_ID: [list of QIDs]} - 2. third-party identifiers to be added. Format: (CSV) QID,identifier_PID,identifier + 2. third-party IDs to be added. CSV format: + QID,third-party_PID,third-party_ID,catalog_ID - 3. URLs to be added. Format: (CSV) QID,P973,URL + 3. URLs to be added. CSV format: + QID,P973,URL,catalog_ID You can pass the '-u' flag to upload the output to Wikidata. @@ -257,9 +260,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): Dump 2 output files: - 1. target identifiers to be deprecated. Format: (JSON) {identifier: [list of QIDs]} + 1. catalog IDs to be deprecated. JSON format: + {catalog_ID: [list of QIDs]} - 2. statements to be added. Format: (CSV) QID,metadata_PID,value + 2. statements to be added. CSV format: + QID,PID,value,catalog_ID You can pass the '-u' flag to upload the output to Wikidata. """ From 2d79ff2790e30aedcc72c031bcf781e335354733 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Wed, 4 Aug 2021 13:25:07 +0000 Subject: [PATCH 02/22] add record linkage item to reference non-machine learning bot edits --- soweego/wikidata/vocabulary.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/soweego/wikidata/vocabulary.py b/soweego/wikidata/vocabulary.py index d990dc1e..7219e5fc 100644 --- a/soweego/wikidata/vocabulary.py +++ b/soweego/wikidata/vocabulary.py @@ -23,11 +23,15 @@ INSTANCE_OF = 'P31' OCCUPATION = 'P106' -# References nodes terms +# References node terms # 'based on heuristic' was introduced upon community discussion # See https://github.com/Wikidata/soweego/issues/373 BASED_ON_HEURISTIC = 'P887' +# Main task: the linker uses machine learning ARTIFICIAL_INTELLIGENCE = 'Q11660' +# Validator tasks: no machine learning +# See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) +RECORD_LINKAGE = 'Q1266546' STATED_IN = 'P248' RETRIEVED = 'P813' From 89528268b2efd48a9348db0df161e304258ddf2d Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Wed, 4 Aug 2021 13:26:52 +0000 Subject: [PATCH 03/22] [WIP] start refactoring to parametrize the P887 heuristic --- soweego/ingester/wikidata_bot.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 42c00386..b96742e8 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -61,11 +61,13 @@ ### # Approved task 1: identifiers addition # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot -IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] with P887 reference, see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' +IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' +'with P887 reference, see 
[[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' # Approved task 2: URL-based validation, criterion 2 # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_2 -URL_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] with extra P887 and catalog ID reference' +URL_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' +'with extra P887 and catalog ID reference' # Approved task 3: works by people # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_3 @@ -198,7 +200,7 @@ def people_cli(catalog, statements, sandbox): claim (Joey Ramone, member of, Ramones) - reference (based on heuristic, artificial intelligence), + reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 264375), (retrieved, today) @@ -789,17 +791,20 @@ def _add( ) -def _reference(claim, catalog_qid, person_pid, person_tid, summary=None): +def _reference( + claim, catalog_qid, person_pid, person_tid, heuristic, + summary=None +): # Reference node # create `pywikibot.Claim` instances at runtime: # pywikibot would cry if the same instances get uploaded multiple times # over the same item - # (based on heuristic, artificial intelligence) reference claim + # (based on heuristic, `heuristic`) reference claim: depends on the task based_on_heuristic_reference = pywikibot.Claim( REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True ) based_on_heuristic_reference.setTarget( - pywikibot.ItemPage(REPO, vocabulary.ARTIFICIAL_INTELLIGENCE) + pywikibot.ItemPage(REPO, heuristic) ) # (stated in, CATALOG) reference claim stated_in_reference = pywikibot.Claim( @@ -814,7 +819,7 @@ def _reference(claim, catalog_qid, person_pid, person_tid, summary=None): if None in (person_pid, person_tid,): reference_log = ( - f'({based_on_heuristic_reference.getID()}, {vocabulary.ARTIFICIAL_INTELLIGENCE}), ' + f'({based_on_heuristic_reference.getID()}, {heuristic}), ' f'({stated_in_reference.getID()}, {catalog_qid}), ' f'({retrieved_reference.getID()}, {TODAY})' ) @@ -840,7 +845,7 @@ def _reference(claim, catalog_qid, person_pid, person_tid, summary=None): tid_reference.setTarget(person_tid) reference_log = ( - f'({based_on_heuristic_reference.getID()}, {vocabulary.ARTIFICIAL_INTELLIGENCE}), ' + f'({based_on_heuristic_reference.getID()}, {heuristic}), ' f'({stated_in_reference.getID()}, {catalog_qid}), ' f'({person_pid}, {person_tid}), ' f'({retrieved_reference.getID()}, {TODAY})' From 17d62362506cc4bad9c000424431241d5fd7e7c3 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Wed, 4 Aug 2021 13:31:51 +0000 Subject: [PATCH 04/22] close #406 ; fix WD cache loading bug; don't remove dates from the original data; ensure complete comparison of dates --- soweego/validator/checks.py | 88 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index d0e96bdb..ae0a4813 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -550,8 +550,8 @@ def _apply_url_blacklist(url_statements): # O(nm) complexity: n = len(blacklist); m = len(url_statements) # Expected order of magnitude: n = 10^2; m = 10^5 for domain in blacklist: # 10^2 - url_statements = list( - filter( # Slurp the filter or it won't work + url_statements = list( # Slurp the filter or it won't work + filter( lambda stmt: domain not in stmt[2], url_statements # 10^5 ) ) @@ -564,9 +564,9 @@ def 
_apply_url_blacklist(url_statements): def _bio_to_be_added_generator(to_be_added): - for qid, values in to_be_added.items(): + for (qid, tid,), values in to_be_added.items(): for pid, value in values: - yield qid, pid, value + yield (qid, pid, value, tid,) def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): @@ -650,11 +650,11 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): def _compute_shared_and_extra(criterion, wd_data, target_data): # Properly compare dates when checking biographical data if criterion == keys.BIODATA: - wd_dates = _extract_dates(wd_data) - target_dates = _extract_dates(target_data) + wd_dates, wd_other = _extract_dates(wd_data) + target_dates, target_other = _extract_dates(target_data) shared_dates, extra_dates = _compare_dates(wd_dates, target_dates) - shared = wd_data.intersection(target_data).union(shared_dates) - extra = target_data.difference(wd_data).union(extra_dates) + shared = wd_other.intersection(target_other).union(shared_dates) + extra = target_other.difference(wd_other).union(extra_dates) else: shared = wd_data.intersection(target_data) extra = target_data.difference(wd_data) @@ -667,49 +667,50 @@ def _extract_dates(data): for pid, value in data: if pid in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH): dates.add((pid, value)) - # Remove dates from input set - data.difference_update(dates) - return dates + # Separate dates from other data + return dates, data.difference(dates) def _compare_dates(wd, target): + # Ensure unique comparisons, regardless of different precisions. + # For instance: + # `wd` has '1986-01-01/9' and '1986-11-29/11' + # `target` has '1986-01-01/9' + # `shared_dates` will have one element shared_dates, extra_dates = set(), set() - for wd_elem, t_elem in zip_longest(wd, target): - # Skip pair with None elements - if None in (wd_elem, t_elem): - continue - - wd_pid, wd_val = wd_elem - t_pid, t_val = t_elem + for wd_elem in wd: + for t_elem in target: + wd_pid, wd_val = wd_elem + t_pid, t_val = t_elem - # Don't compare birth with death dates - if wd_pid != t_pid: - continue + # Don't compare birth with death dates + if wd_pid != t_pid: + continue - # Skip unexpected None values - if None in (wd_val, t_val): - LOGGER.warning( - 'Skipping unexpected %s date pair with missing value(s)', - (wd_elem, t_elem), - ) - continue + # Skip unexpected `None` values + if None in (wd_val, t_val): + LOGGER.warning( + 'Skipping unexpected %s date pair with missing value(s)', + (wd_elem, t_elem), + ) + continue - wd_timestamp, wd_precision = wd_val.split('/') - t_timestamp, t_precision = t_val.split('/') + wd_timestamp, wd_precision = wd_val.split('/') + t_timestamp, t_precision = t_val.split('/') - shared_date, extra_date = _match_dates_by_precision( - min(int(wd_precision), int(t_precision)), - wd_elem, - wd_timestamp, - t_elem, - t_timestamp, - ) + shared_date, extra_date = _match_dates_by_precision( + min(int(wd_precision), int(t_precision)), + wd_elem, + wd_timestamp, + t_elem, + t_timestamp, + ) - if shared_date is not None: - shared_dates.add(shared_date) - if extra_date is not None: - extra_dates.add(extra_date) + if shared_date is not None: + shared_dates.add(shared_date) + if extra_date is not None: + extra_dates.add(extra_date) return shared_dates, extra_dates @@ -740,6 +741,7 @@ def _match_dates_by_precision( (wd_timestamp, t_timestamp), (wd_simplified, t_simplified), ) + # WD data has the priority shared = wd_elem else: LOGGER.debug('Target has an extra date: %s', t_timestamp) 
@@ -791,12 +793,11 @@ def _dump_csv_output(data, outpath, log_msg_subject): def _load_wd_cache(file_handle): raw_cache = json.load(file_handle) - LOGGER.info("Loaded Wikidata cache from '%s'", file_handle.name) cache = {} for qid, data in raw_cache.items(): for data_type, value_list in data.items(): # Biodata has values that are a list - if isinstance(value_list[0], list): + if value_list and isinstance(value_list[0], list): value_set = set() for value in value_list: if isinstance(value[1], list): @@ -814,6 +815,7 @@ def _load_wd_cache(file_handle): cache[qid][data_type] = set(value_list) else: cache[qid] = {data_type: set(value_list)} + LOGGER.info("Loaded Wikidata cache from '%s'", file_handle.name) return cache From b35e1ecae1fc76fc3eff74a318c86ea607a892a1 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Thu, 5 Aug 2021 16:12:03 +0200 Subject: [PATCH 05/22] [WIP] huge refactoring --- soweego/ingester/wikidata_bot.py | 557 ++++++++++++++----------------- 1 file changed, 253 insertions(+), 304 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index b96742e8..3563251c 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -2,17 +2,17 @@ # -*- coding: utf-8 -*- """A `Wikidata bot `_ that adds, deletes, or deprecates referenced statements. -Here are typical output examples. +Here are typical output examples: :func:`add_identifiers` | *Claim:* `Joey Ramone `_, `Discogs artist ID `_, `264375 `_ - | *Reference:* `stated in `_, `Discogs `_), (`retrieved `_, TIMESTAMP + | *Reference:* (`based on heuristic `_, `artificial intelligence `_), (`retrieved `_, TIMESTAMP) :func:`add_people_statements` | *Claim:* `Joey Ramone `_, `member of `_, `Ramones `_ - | *Reference:* `stated in `_, `Discogs `_), (`retrieved `_, TIMESTAMP + | *Reference:* (`based on heuristic `_, `record linkage `_),`(stated in `_, `Discogs `_), (`Discogs artist ID `_, `264375 `_), (`retrieved `_, TIMESTAMP) :func:`add_works_statements` | *Claim:* `Leave Home `_, `performer `_, `Ramones `_ - | *Reference:* `stated in `_, `Discogs `_), (`Discogs artist ID `_, `264375 `_), (`retrieved `_, TIMESTAMP + | *Reference:* (`based on heuristic `_, `record linkage `_),`(stated in `_, `Discogs `_), (`Discogs artist ID `_, `264375 `_), (`retrieved `_, TIMESTAMP) :func:`delete_or_deprecate_identifiers` deletes or deprecates identifier statements. 
@@ -20,9 +20,9 @@ __author__ = 'Marco Fossati' __email__ = 'fossati@spaziodati.eu' -__version__ = '1.0' +__version__ = '2.0' __license__ = 'GPL-3.0' -__copyright__ = 'Copyleft 2018, Hjfocs' +__copyright__ = 'Copyleft 2021, Hjfocs' import csv import json @@ -46,27 +46,17 @@ SITE = pywikibot.Site('wikidata', 'wikidata') REPO = SITE.data_repository() -# Time stamp object for the (retrieved, TIMESTAMP) reference -TODAY = date.today() -TIMESTAMP = pywikibot.WbTime( - site=REPO, - year=TODAY.year, - month=TODAY.month, - day=TODAY.day, - precision='day', -) - -### +####################### # BEGIN: Edit summaries -### +####################### # Approved task 1: identifiers addition # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' 'with P887 reference, see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' -# Approved task 2: URL-based validation, criterion 2 +# Approved task 2: URLs validation, criterion 2 # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_2 -URL_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' +LINKS_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' 'with extra P887 and catalog ID reference' # Approved task 3: works by people @@ -74,9 +64,23 @@ WORKS_SUMMARY = ( '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_3|bot task 3]]' ) -### + +# Biographical data validation, criterion 3 +# TODO add wikilink once the bot task gets approved +BIO_VALIDATION_SUMMARY = 'bot task 4' +##################### # END: Edit summaries -### +##################### + +# Time stamp object for the (retrieved, TIMESTAMP) reference +TODAY = date.today() +TIMESTAMP = pywikibot.WbTime( + site=REPO, + year=TODAY.year, + month=TODAY.month, + day=TODAY.day, + precision='day', +) # We also support Twitter SUPPORTED_TARGETS = target_database.supported_targets() ^ {TWITTER} @@ -176,13 +180,20 @@ def identifiers_cli(catalog, entity, identifiers, sandbox): @click.command() @click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS)) @click.argument('statements', type=click.File()) +@click.option( + '-c', + '--criterion', + type=click.Choice(('links', 'bio')), + help='Validation criterion used to generate STATEMENTS. ' + 'Same as the command passed to `python -m soweego sync`' +) @click.option( '-s', '--sandbox', is_flag=True, help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) -def people_cli(catalog, statements, sandbox): +def people_cli(catalog, statements, criterion, sandbox): """Add statements to Wikidata people. STATEMENTS must be a CSV file. 
@@ -205,37 +216,35 @@ def people_cli(catalog, statements, sandbox): (Discogs artist ID, 264375), (retrieved, today) """ + sandbox_item = vocabulary.SANDBOX_2 + # See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) + heuristic = vocabulary.RECORD_LINKAGE catalog_qid = target_database.get_catalog_qid(catalog) - person_pid = target_database.get_person_pid(catalog) + catalog_pid = target_database.get_person_pid(catalog) + + if criterion == 'links': + edit_summary = LINKS_VALIDATION_SUMMARY + elif criterion == 'bio': + edit_summary = BIO_VALIDATION_SUMMARY + else: + edit_summary = None if sandbox: LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 + 'Running on the Wikidata sandbox item %s ...', sandbox_item ) stmt_reader = csv.reader(statements) - for statement in stmt_reader: - person, predicate, value, person_tid = statement - if sandbox: - _add_or_reference( - vocabulary.SANDBOX_2, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) - else: - _add_or_reference( - person, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) + for person, predicate, value, catalog_id in stmt_reader: + subject = person if not sandbox else sandbox_item + _add_or_reference( + (subject, predicate, value), + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary + ) @click.command() @@ -245,7 +254,7 @@ def people_cli(catalog, statements, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def works_cli(catalog, statements, sandbox): """Add statements to Wikidata works. 
@@ -265,37 +274,26 @@ def works_cli(catalog, statements, sandbox): claim (C'mon Everybody, performer, Eddie Cochran) - reference (based on heuristic, artificial intelligence), + reference (based on heuristic, record linkage), (Discogs artist ID, 139984), (retrieved, today) """ + sandbox_item = vocabulary.SANDBOX_2 + catalog_qid = target_database.get_catalog_qid(catalog) is_imdb, person_pid = _get_works_args(catalog) + heuristic = vocabulary.RECORD_LINKAGE if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) stmt_reader = csv.reader(statements) - for statement in stmt_reader: - work, predicate, person, person_tid = statement - if sandbox: - _add_or_reference_works( - vocabulary.SANDBOX_1, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) - else: - _add_or_reference_works( - work, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) + for work, predicate, person, person_id in stmt_reader: + subject = work if not sandbox else sandbox_item + _add_or_reference_works( + (subject, predicate, person), + heuristic, + catalog_qid, person_pid, person_id, + is_imdb=is_imdb, edit_summary=WORKS_SUMMARY + ) def add_identifiers( @@ -312,27 +310,23 @@ def add_identifiers( :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + sandbox_item = vocabulary.SANDBOX_2 catalog_pid = target_database.get_catalog_pid(catalog, entity) + heuristic = vocabulary.ARTIFICIAL_INTELLIGENCE + + if sandbox: + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) + for qid, tid in identifiers.items(): LOGGER.info('Processing %s match: %s -> %s', catalog, qid, tid) - if sandbox: - LOGGER.debug( - 'Using Wikidata sandbox item %s as subject, instead of %s', - vocabulary.SANDBOX_1, - qid, - ) - _add_or_reference( - vocabulary.SANDBOX_1, - catalog_pid, - tid, - summary=IDENTIFIERS_SUMMARY, - ) - else: - _add_or_reference( - qid, catalog_pid, tid, summary=IDENTIFIERS_SUMMARY - ) + subject = qid if not sandbox else sandbox_item + _add_or_reference( + (subject, catalog_pid, tid,), + heuristic, + edit_summary=IDENTIFIERS_SUMMARY) +# TODO handle edit summary def add_people_statements( catalog: str, statements: Iterable, sandbox: bool ) -> None: @@ -343,39 +337,34 @@ def add_people_statements( :func:`soweego.validator.checks.bio`. :param statements: iterable of - (subject, predicate, value, target ID) tuples + (subject, predicate, value, catalog ID) tuples :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``. 
A supported catalog :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + sandbox_item = vocabulary.SANDBOX_2 catalog_qid = target_database.get_catalog_qid(catalog) person_pid = target_database.get_person_pid(catalog) + heuristic = vocabulary.RECORD_LINKAGE + + if sandbox: + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) - for subject, predicate, value, person_tid in statements: + for subject, predicate, value, catalog_id in statements: LOGGER.info( - 'Processing (%s, %s, %s) statement', subject, predicate, value + 'Processing (%s, %s, %s, %s) statement ...', + subject, predicate, value, catalog_id + ) + actual_subject = subject if not sandbox else sandbox_item + _add_or_reference( + (actual_subject, predicate, value), + heuristic, + catalog_qid=catalog_qid, + catalog_pid=person_pid, + catalog_id=catalog_id, + edit_summary=LINKS_VALIDATION_SUMMARY ) - if sandbox: - _add_or_reference( - vocabulary.SANDBOX_2, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) - else: - _add_or_reference( - subject, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) def add_works_statements( @@ -393,32 +382,29 @@ def add_works_statements( :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + sandbox_item = vocabulary.SANDBOX_2 + catalog_qid = target_database.get_catalog_qid(catalog) is_imdb, person_pid = _get_works_args(catalog) + heuristic = vocabulary.RECORD_LINKAGE + + if sandbox: + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) - for work, predicate, person, person_tid in statements: + for work, predicate, person, person_id in statements: LOGGER.info( - 'Processing (%s, %s, %s) statement', work, predicate, person + 'Processing (%s, %s, %s, %s) statement', + work, predicate, person, person_id + ) + subject = work if not sandbox else sandbox_item + _add_or_reference_works( + (subject, predicate, person), + heuristic, + catalog_qid, + person_pid, + person_id, + is_imdb=is_imdb, + edit_summary=WORKS_SUMMARY ) - if sandbox: - _add_or_reference_works( - vocabulary.SANDBOX_1, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) - else: - _add_or_reference_works( - work, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) def delete_or_deprecate_identifiers( @@ -459,35 +445,26 @@ def delete_or_deprecate_identifiers( _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) -def _add_or_reference_works( - work: str, - predicate: str, - person: str, - person_pid: str, - person_tid: str, - is_imdb=False, - summary=None, -) -> None: +def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_pid: str, catalog_id: str, + is_imdb=False, edit_summary=None) -> None: + work, predicate, person = statement # Parse value into an item in case of QID qid = match(QID_REGEX, person) if not qid: LOGGER.warning( - "%s doesn't look like a QID, won't try to add the (%s, %s, %s) statement", - person, - work, - predicate, - person, + "%s doesn't look like a QID, won't try to add the %s statement", + person, statement ) return - person = pywikibot.ItemPage(REPO, qid.group()) + person_item = pywikibot.ItemPage(REPO, qid.group()) subject_item, claims = _essential_checks( - work, - predicate, - person, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + (work, predicate, person_item), + 
heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary, ) if None in (subject_item, claims): return @@ -497,12 +474,12 @@ def _add_or_reference_works( for pred in vocabulary.MOVIE_PIDS: if _check_for_same_value( claims, - work, - pred, - person, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + (work, pred, person_item), + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ): return @@ -510,30 +487,24 @@ def _add_or_reference_works( claims, subject_item, predicate, - person, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + person_item, + heuristic, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary, ) def _add_or_reference( - subject: str, - predicate: str, - value: str, - catalog_qid: str, - person_pid: str, - person_tid: str, - summary=None, + statement, heuristic, + catalog_qid=None, catalog_pid=None, catalog_id=None, + edit_summary=None ) -> None: + subject, predicate, value = statement subject_item, claims = _essential_checks( - subject, - predicate, - value, - catalog_qid, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + statement, heuristic, + catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, + edit_summary=edit_summary ) if None in (subject_item, claims): @@ -545,11 +516,12 @@ def _add_or_reference( # See https://www.wikidata.org/wiki/User_talk:Jura1#Thanks_for_your_feedback_on_User:Soweego_bot_task_2 if _check_for_same_value( claims, - subject, - vocabulary.OFFICIAL_WEBSITE, - value, - catalog_qid, - summary=summary, + (subject, vocabulary.OFFICIAL_WEBSITE, value,), + heuristic, + edit_summary=edit_summary, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id ): return @@ -565,11 +537,12 @@ def _add_or_reference( subject_item, predicate, value, - catalog_qid, + heuristic, case_insensitive=case_insensitive, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) @@ -578,11 +551,12 @@ def _handle_addition( subject_item, predicate, value, - catalog_qid, + heuristic, case_insensitive=False, - person_pid=None, - person_tid=None, - summary=None, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): given_predicate_claims = claims.get(predicate) subject_qid = subject_item.getID() @@ -591,13 +565,12 @@ def _handle_addition( if not given_predicate_claims: LOGGER.debug('%s has no %s claim', subject_qid, predicate) _add( - subject_item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + subject_item, predicate, value, + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return @@ -620,13 +593,12 @@ def _handle_addition( '%s has no %s claim with value %s', subject_qid, predicate, value ) _add( - subject_item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + subject_item, predicate, value, + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return @@ -637,16 +609,12 @@ def _handle_addition( if case_insensitive: for claim in given_predicate_claims: if claim.getTarget().lower() == value: - _reference( - claim, catalog_qid, person_pid, person_tid, summary=summary - ) + 
_reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) return for claim in given_predicate_claims: if claim.getTarget() == value: - _reference( - claim, catalog_qid, person_pid, person_tid, summary=summary - ) + _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) def _handle_redirect_and_dead(qid): @@ -665,14 +633,14 @@ def _handle_redirect_and_dead(qid): def _essential_checks( - subject, - predicate, - value, - catalog_qid, - person_pid=None, - person_tid=None, - summary=None, + statement: tuple, + heuristic: str, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): + subject, predicate, value = statement item, data = _handle_redirect_and_dead(subject) if item is None and data is None: @@ -682,13 +650,11 @@ def _essential_checks( if not data: LOGGER.warning('%s has no data at all', subject) _add( - item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + item, predicate, value, heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return None, None @@ -697,13 +663,11 @@ def _essential_checks( if not claims: LOGGER.warning('%s has no claims', subject) _add( - item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + item, predicate, value, heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return None, None @@ -712,14 +676,14 @@ def _essential_checks( def _check_for_same_value( subject_claims, - subject, - predicate, - value, - catalog_qid, - person_pid=None, - person_tid=None, - summary=None, + statement, + heuristic, + edit_summary=None, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, ): + subject, predicate, value = statement given_predicate_claims = subject_claims.get(predicate) if given_predicate_claims: for claim in given_predicate_claims: @@ -731,7 +695,11 @@ def _check_for_same_value( value, ) _reference( - claim, catalog_qid, person_pid, person_tid, summary=summary + claim, heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return True return False @@ -776,98 +744,79 @@ def _add( subject_item, predicate, value, - catalog_qid, - person_pid, - person_tid, - summary=None, + heuristic, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): claim = pywikibot.Claim(REPO, predicate) claim.setTarget(value) - subject_item.addClaim(claim, summary=summary) + subject_item.addClaim(claim, summary=edit_summary) LOGGER.debug('Added claim: %s', claim.toJSON()) - _reference(claim, catalog_qid, person_pid, person_tid, summary=summary) + _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) LOGGER.info( 'Added (%s, %s, %s) statement', subject_item.getID(), predicate, value ) def _reference( - claim, catalog_qid, person_pid, person_tid, heuristic, - summary=None + claim: pywikibot.Claim, heuristic: str, + catalog_qid=None, catalog_pid=None, catalog_id=None, edit_summary=None ): - # Reference node - # create `pywikibot.Claim` instances at runtime: + reference_node, log_buffer = [], [] + + # Create `pywikibot.Claim` instances at runtime: # pywikibot would cry if the same instances get uploaded multiple times # over the same item - # (based on heuristic, `heuristic`) reference claim: depends on the task + + # Depends on the bot task + # (based on 
heuristic, `heuristic`) reference claim based_on_heuristic_reference = pywikibot.Claim( REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True ) based_on_heuristic_reference.setTarget( pywikibot.ItemPage(REPO, heuristic) ) - # (stated in, CATALOG) reference claim - stated_in_reference = pywikibot.Claim( - REPO, vocabulary.STATED_IN, is_reference=True - ) - stated_in_reference.setTarget(pywikibot.ItemPage(REPO, catalog_qid)) + reference_node.append(based_on_heuristic_reference) + log_buffer.append(f'({based_on_heuristic_reference.getID()}, {heuristic})') + + # Validator tasks only + if catalog_qid is not None: + # (stated in, CATALOG) reference claim + stated_in_reference = pywikibot.Claim( + REPO, vocabulary.STATED_IN, is_reference=True + ) + stated_in_reference.setTarget(pywikibot.ItemPage(REPO, catalog_qid)) + reference_node.append(stated_in_reference) + log_buffer.append(f'({stated_in_reference.getID()}, {catalog_qid})') + + if catalog_pid is not None and catalog_id is not None: + # (catalog property, catalog ID) reference claim + catalog_id_reference = pywikibot.Claim(REPO, catalog_pid, is_reference=True) + catalog_id_reference.setTarget(catalog_id) + reference_node.append(catalog_id_reference) + log_buffer.append(f'({catalog_pid}, {catalog_id})') + + # All tasks # (retrieved, TODAY) reference claim retrieved_reference = pywikibot.Claim( REPO, vocabulary.RETRIEVED, is_reference=True ) retrieved_reference.setTarget(TIMESTAMP) + reference_node.append(retrieved_reference) + log_buffer.append(f'({retrieved_reference.getID()}, {TODAY})') - if None in (person_pid, person_tid,): - reference_log = ( - f'({based_on_heuristic_reference.getID()}, {heuristic}), ' - f'({stated_in_reference.getID()}, {catalog_qid}), ' - f'({retrieved_reference.getID()}, {TODAY})' - ) - - try: - claim.addSources( - [ - based_on_heuristic_reference, - stated_in_reference, - retrieved_reference, - ], - summary=summary, - ) + log_msg = ', '.join(log_buffer) - LOGGER.info('Added %s reference node', reference_log) - except (APIError, Error,) as error: - LOGGER.warning( - 'Could not add %s reference node: %s', reference_log, error - ) - else: - # (catalog property, catalog_ID) reference claim - tid_reference = pywikibot.Claim(REPO, person_pid, is_reference=True) - tid_reference.setTarget(person_tid) - - reference_log = ( - f'({based_on_heuristic_reference.getID()}, {heuristic}), ' - f'({stated_in_reference.getID()}, {catalog_qid}), ' - f'({person_pid}, {person_tid}), ' - f'({retrieved_reference.getID()}, {TODAY})' + try: + claim.addSources(reference_node, summary=edit_summary) + LOGGER.info('Added %s reference node', log_msg) + except (APIError, Error,) as error: + LOGGER.warning( + 'Could not add %s reference node: %s', log_msg, error ) - try: - claim.addSources( - [ - based_on_heuristic_reference, - stated_in_reference, - tid_reference, - retrieved_reference, - ], - summary=summary, - ) - - LOGGER.info('Added %s reference node', reference_log) - except (APIError, Error,) as error: - LOGGER.warning( - 'Could not add %s reference node: %s', reference_log, error - ) - def _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) -> None: item, data = _handle_redirect_and_dead(qid) From 41d4d3042c7fa167ac0a4942e378a140ae99ca7b Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Fri, 6 Aug 2021 15:00:06 +0000 Subject: [PATCH 06/22] pass catalog QID to works; improve CLI help; use sandbox 2; simplify code --- soweego/ingester/wikidata_bot.py | 48 +++++++++++++++----------------- 1 file changed, 22 insertions(+), 26 
deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 3563251c..f8ac8c80 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -51,18 +51,24 @@ ####################### # Approved task 1: identifiers addition # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot -IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' -'with P887 reference, see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' +IDENTIFIERS_SUMMARY = ( + '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' + 'with P887 reference, ' + 'see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' +) # Approved task 2: URLs validation, criterion 2 # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_2 -LINKS_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' -'with extra P887 and catalog ID reference' +LINKS_VALIDATION_SUMMARY = ( + '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' + 'with extra P887 and catalog ID reference' +) # Approved task 3: works by people # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_3 WORKS_SUMMARY = ( - '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_3|bot task 3]]' + '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_3|bot task 3]] ' + 'with extra P887 reference' ) # Biographical data validation, criterion 3 @@ -122,7 +128,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help='Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): """Deprecate invalid identifiers. @@ -148,7 +154,7 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def identifiers_cli(catalog, entity, identifiers, sandbox): """Add identifiers. 
@@ -168,12 +174,8 @@ def identifiers_cli(catalog, entity, identifiers, sandbox): claim (Richard Hell, Discogs artist ID, 266995) - reference (based on heuristic, artificial intelligence), - (retrieved, today) + reference (based on heuristic, artificial intelligence), (retrieved, today) """ - if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') - add_identifiers(json.load(identifiers), catalog, entity, sandbox) @@ -211,10 +213,7 @@ def people_cli(catalog, statements, criterion, sandbox): claim (Joey Ramone, member of, Ramones) - reference (based on heuristic, record linkage), - (stated in, Discogs), - (Discogs artist ID, 264375), - (retrieved, today) + reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 264375), (retrieved, today) """ sandbox_item = vocabulary.SANDBOX_2 # See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) @@ -274,8 +273,7 @@ def works_cli(catalog, statements, sandbox): claim (C'mon Everybody, performer, Eddie Cochran) - reference (based on heuristic, record linkage), - (Discogs artist ID, 139984), (retrieved, today) + reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 139984), (retrieved, today) """ sandbox_item = vocabulary.SANDBOX_2 catalog_qid = target_database.get_catalog_qid(catalog) @@ -308,7 +306,7 @@ def add_identifiers( 'writer', 'audiovisual_work', 'musical_work'}``. A supported entity :param sandbox: whether to perform edits on the - `Wikidata sandbox `_ item + `Wikidata sandbox 2 `_ item """ sandbox_item = vocabulary.SANDBOX_2 catalog_pid = target_database.get_catalog_pid(catalog, entity) @@ -428,21 +426,18 @@ def delete_or_deprecate_identifiers( A supported entity :param invalid: a ``{invalid_catalog_identifier: [list of QIDs]}`` dictionary :param sandbox: whether to perform edits on the - `Wikidata sandbox `_ item + `Wikidata sandbox 2 `_ item """ + sandbox_item = vocabulary.SANDBOX_2 catalog_pid = target_database.get_catalog_pid(catalog, entity) for tid, qids in invalid.items(): for qid in qids: + actual_qid = qid if not sandbox else sandbox_item LOGGER.info( 'Will %s %s identifier: %s -> %s', action, catalog, tid, qid ) - if sandbox: - _delete_or_deprecate( - action, vocabulary.SANDBOX_1, tid, catalog, catalog_pid - ) - else: - _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) + _delete_or_deprecate(action, actual_qid, tid, catalog, catalog_pid) def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_pid: str, catalog_id: str, @@ -489,6 +484,7 @@ def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, predicate, person_item, heuristic, + catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, edit_summary=edit_summary, From cb814da21e2f1be831029f4324a32dbf97c3a70d Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Mon, 9 Aug 2021 16:21:23 +0200 Subject: [PATCH 07/22] handle edit summary in public function; use sandbox item 2 everywhere & update docstrings --- soweego/ingester/wikidata_bot.py | 34 +++++++++++++++++++++++--------- soweego/linker/baseline.py | 4 +++- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index f8ac8c80..882a9b87 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -102,7 +102,7 @@ '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata 
sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox 2 item {vocabulary.SANDBOX_2}.', ) def delete_cli(catalog, entity, invalid_identifiers, sandbox): """Delete invalid identifiers. @@ -111,7 +111,10 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): Format: { catalog_identifier: [ list of QIDs ] } """ if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) delete_or_deprecate_identifiers( 'delete', catalog, entity, json.load(invalid_identifiers), sandbox @@ -128,7 +131,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): """Deprecate invalid identifiers. @@ -137,7 +140,10 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): Format: { catalog_identifier: [ list of QIDs ] } """ if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) delete_or_deprecate_identifiers( 'deprecate', catalog, entity, json.load(invalid_identifiers), sandbox @@ -324,9 +330,8 @@ def add_identifiers( edit_summary=IDENTIFIERS_SUMMARY) -# TODO handle edit summary def add_people_statements( - catalog: str, statements: Iterable, sandbox: bool + catalog: str, statements: Iterable, criterion: str, sandbox: bool ) -> None: """Add statements to existing Wikidata people. @@ -334,13 +339,24 @@ def add_people_statements( as per :func:`soweego.validator.checks.links` and :func:`soweego.validator.checks.bio`. - :param statements: iterable of - (subject, predicate, value, catalog ID) tuples :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``. A supported catalog + :param statements: iterable of + (subject, predicate, value, catalog ID) tuples + :param criterion: ``{'links', 'bio'}``. A supported validation criterion :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + if criterion == 'links': + edit_summary = LINKS_VALIDATION_SUMMARY + elif criterion == 'bio': + edit_summary = BIO_VALIDATION_SUMMARY + else: + raise ValueError( + f"Invalid criterion: '{criterion}'. 
" + "Please use either 'links' or 'bio'" + ) + sandbox_item = vocabulary.SANDBOX_2 catalog_qid = target_database.get_catalog_qid(catalog) person_pid = target_database.get_person_pid(catalog) @@ -361,7 +377,7 @@ def add_people_statements( catalog_qid=catalog_qid, catalog_pid=person_pid, catalog_id=catalog_id, - edit_summary=LINKS_VALIDATION_SUMMARY + edit_summary=edit_summary ) diff --git a/soweego/linker/baseline.py b/soweego/linker/baseline.py index f1ced1fb..3ae0ab44 100644 --- a/soweego/linker/baseline.py +++ b/soweego/linker/baseline.py @@ -272,7 +272,9 @@ def _handle_result( to_upload.add(statement) if upload: - wikidata_bot.add_people_statements(to_upload, sandbox) + wikidata_bot.add_people_statements( + catalog, to_upload, 'links', sandbox + ) LOGGER.info('%s %s dumped to %s', catalog, origin, path_out) From 46b8080d0b978e008ad931acb80e1b43e3e6f4ed Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Mon, 9 Aug 2021 16:23:36 +0200 Subject: [PATCH 08/22] simpler although a bit more redundant WD upload code; update docstrings to use sandbox item 2 --- soweego/validator/checks.py | 76 ++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index ae0a4813..0206c17e 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -5,17 +5,16 @@ __author__ = 'Marco Fossati' __email__ = 'fossati@spaziodati.eu' -__version__ = '1.0' +__version__ = '2.0' __license__ = 'GPL-3.0' -__copyright__ = 'Copyleft 2018, Hjfocs' +__copyright__ = 'Copyleft 2021, Hjfocs' import csv import json import logging import os from collections import defaultdict -from itertools import zip_longest -from typing import DefaultDict, Dict, Iterator, List, Tuple +from typing import DefaultDict, Dict, Iterator, Tuple, Union import click from sqlalchemy.exc import SQLAlchemyError @@ -59,7 +58,7 @@ '-s', '--sandbox', is_flag=True, - help='Perform all deprecations on the Wikidata sandbox item Q4115189.', + help=f'Perform all deprecations on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) @click.option( '--dump-wikidata', @@ -114,7 +113,10 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): # Deprecate dead ids in Wikidata if deprecate: - _upload_result(catalog, entity, dead, None, None, sandbox) + LOGGER.info('Starting deprecation of %s IDs ...', catalog) + wikidata_bot.delete_or_deprecate_identifiers( + 'deprecate', catalog, entity, dead, sandbox + ) @click.command() @@ -137,7 +139,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) @click.option( '--dump-wikidata', @@ -189,13 +191,13 @@ def links_cli( with open(wd_links_path) as wdin: wd_links = _load_wd_cache(wdin) # Discard the last return value: Wikidata cache - ids_to_be_deprecated, ids_to_be_added, urls_to_be_added, _ = links( + ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, _ = links( catalog, entity, blacklist, wd_cache=wd_links ) else: ( ids_to_be_deprecated, - ids_to_be_added, + ext_ids_to_be_added, urls_to_be_added, wd_links, ) = links(catalog, entity, blacklist) @@ -211,18 +213,23 @@ def links_cli( # Dump output files _dump_deprecated(ids_to_be_deprecated, deprecated_path) - _dump_csv_output(ids_to_be_added, ids_path, 'third-party IDs') + _dump_csv_output(ext_ids_to_be_added, ids_path, 
'third-party IDs') _dump_csv_output(urls_to_be_added, urls_path, 'URLs') # Upload the output to Wikidata if upload: - _upload_result( - catalog, - entity, - ids_to_be_deprecated, - urls_to_be_added, - ids_to_be_added, - sandbox, + criterion = 'links' + LOGGER.info('Starting deprecation of %s IDs ...', catalog) + wikidata_bot.delete_or_deprecate_identifiers( + 'deprecate', catalog, entity, ids_to_be_deprecated, sandbox + ) + LOGGER.info('Starting addition of external IDs to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, ext_ids_to_be_added, criterion, sandbox + ) + LOGGER.info('Starting addition of statements to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, urls_to_be_added, criterion, sandbox ) @@ -240,7 +247,7 @@ def links_cli( '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) @click.option( '--dump-wikidata', @@ -302,7 +309,15 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # Upload the output to Wikidata if upload: - _upload(catalog, entity, to_be_deprecated, to_be_added, sandbox) + criterion = 'bio' + LOGGER.info('Starting deprecation of %s IDs ...', catalog) + wikidata_bot.delete_or_deprecate_identifiers( + 'deprecate', catalog, entity, to_be_deprecated, sandbox + ) + LOGGER.info('Starting addition of statements to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, to_be_added, criterion, sandbox + ) def dead_ids( @@ -386,7 +401,7 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Tuple[DefaultDict, List, List, Dict]: +) -> Union[Tuple[defaultdict, list, list, dict], Tuple[None, None, None, None]]: """Validate identifiers against available links. Also generate statements based on additional links @@ -477,7 +492,7 @@ def links( def bio( catalog: str, entity: str, wd_cache=None -) -> Tuple[DefaultDict, Iterator, Dict]: +) -> Union[Tuple[defaultdict, Iterator, dict], Tuple[None, None, None]]: """Validate identifiers against available biographical data. 
Look for: @@ -566,7 +581,7 @@ def _apply_url_blacklist(url_statements): def _bio_to_be_added_generator(to_be_added): for (qid, tid,), values in to_be_added.items(): for pid, value in values: - yield (qid, pid, value, tid,) + yield qid, pid, value, tid def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): @@ -751,23 +766,6 @@ def _match_dates_by_precision( return shared, extra -def _upload_result( - catalog, entity, to_deprecate, urls_to_add, ext_ids_to_add, sandbox -): - _upload(catalog, entity, to_deprecate, urls_to_add, sandbox) - LOGGER.info('Starting addition of external IDs to Wikidata ...') - wikidata_bot.add_people_statements(catalog, ext_ids_to_add, sandbox) - - -def _upload(catalog, entity, to_deprecate, to_add, sandbox): - LOGGER.info('Starting deprecation of %s IDs ...', catalog) - wikidata_bot.delete_or_deprecate_identifiers( - 'deprecate', catalog, entity, to_deprecate, sandbox - ) - LOGGER.info('Starting addition of statements to Wikidata ...') - wikidata_bot.add_people_statements(catalog, to_add, sandbox) - - def _dump_deprecated(data, outpath): if data: with open(outpath, 'w') as deprecated: From df82da1f603258ebcc542187b4cd04d6865e77e0 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Mon, 9 Aug 2021 15:33:53 +0000 Subject: [PATCH 09/22] remove 2 --- soweego/ingester/wikidata_bot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 882a9b87..5d2a6bc7 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -102,7 +102,7 @@ '-s', '--sandbox', is_flag=True, - help=f'Perform all edits on the Wikidata sandbox 2 item {vocabulary.SANDBOX_2}.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def delete_cli(catalog, entity, invalid_identifiers, sandbox): """Delete invalid identifiers. 
From b37bad11ee6ff84e9a991862ad57b402b4c725e0 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Mon, 9 Aug 2021 15:34:26 +0000 Subject: [PATCH 10/22] log an info msg when uploading to sandbox --- soweego/validator/checks.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 0206c17e..314e6a50 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -218,6 +218,11 @@ def links_cli( # Upload the output to Wikidata if upload: + if sandbox: + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) criterion = 'links' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( @@ -309,6 +314,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # Upload the output to Wikidata if upload: + if sandbox: + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) criterion = 'bio' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( From d3427d0a651051ea9d39a7b7b92594d0b1e87ee7 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 10 Aug 2021 10:42:48 +0000 Subject: [PATCH 11/22] keep track of matching dates to avoid incorrect comparisons, closes #412 ; return extra dates in 'ISO_date/precision' format --- soweego/validator/checks.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 314e6a50..46bdeb73 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -703,9 +703,14 @@ def _compare_dates(wd, target): # `target` has '1986-01-01/9' # `shared_dates` will have one element shared_dates, extra_dates = set(), set() + wd_matches, target_matches = [], [] + + for i, wd_elem in enumerate(wd): + for j, t_elem in enumerate(target): + # Don't compare when already matched + if i in wd_matches or j in target_matches: + continue - for wd_elem in wd: - for t_elem in target: wd_pid, wd_val = wd_elem t_pid, t_val = t_elem @@ -734,9 +739,14 @@ def _compare_dates(wd, target): if shared_date is not None: shared_dates.add(shared_date) - if extra_date is not None: + # Keep track of matches to avoid useless computation + # and incorrect comparisons: + # this happens when WD has multiple claims with + # the same property + wd_matches.append(i) + target_matches.append(j) + elif extra_date is not None: extra_dates.add(extra_date) - return shared_dates, extra_dates @@ -770,9 +780,7 @@ def _match_dates_by_precision( shared = wd_elem else: LOGGER.debug('Target has an extra date: %s', t_timestamp) - # Output dates in ISO format - # t_elem[0] is the PID - extra = (t_elem[0], t_timestamp) + extra = t_elem return shared, extra From 2a90b0fbc366950eee856870ce1d029d596e7f17 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 10 Aug 2021 10:44:17 +0000 Subject: [PATCH 12/22] expect date strings in 'ISO-date/precision' format: avoid 01-01 precision hack --- soweego/ingester/wikidata_bot.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 5d2a6bc7..30ad56f1 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -727,18 +727,14 @@ def _parse_value(value): return pywikibot.ItemPage(REPO, value_is_qid.group()) # Try to build a date try: - date_value = date.fromisoformat(value) - # Precision 
hack: it's a year if both month and day are 1 - precision = ( - vocabulary.YEAR - if date_value.month == 1 and date_value.day == 1 - else vocabulary.DAY - ) + # A date should be in the form '1984-11-16/11' + date_str, precision = value.split('/') + date_obj = date.fromisoformat(date_str) return pywikibot.WbTime( - date_value.year, - date_value.month, - date_value.day, - precision=precision, + date_obj.year, + date_obj.month, + date_obj.day, + precision=int(precision), ) # Otherwise return the value as is except ValueError: From 5fda9e8fc3fe5ebbe86c780e282fda7a434f6561 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 10 Aug 2021 16:38:23 +0000 Subject: [PATCH 13/22] [WIP] start work on issue #413 --- soweego/commons/data_gathering.py | 35 +++++++++++++++++-------------- soweego/validator/checks.py | 15 ++++++++++++- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/soweego/commons/data_gathering.py b/soweego/commons/data_gathering.py index c14709aa..86a150ee 100644 --- a/soweego/commons/data_gathering.py +++ b/soweego/commons/data_gathering.py @@ -20,7 +20,7 @@ from sqlalchemy import or_ from tqdm import tqdm -from soweego.commons import constants, keys, target_database, url_utils +from soweego.commons import constants, keys, target_database, text_utils, url_utils from soweego.commons.db_manager import DBManager from soweego.importer import models from soweego.wikidata import api_requests, sparql_queries, vocabulary @@ -382,27 +382,30 @@ def gather_wikidata_biodata(wikidata): for qid, pid, value in api_requests.get_biodata(wikidata.keys()): parsed = api_requests.parse_value(value) if not wikidata[qid].get(keys.BIODATA): - wikidata[qid][keys.BIODATA] = set() - # `parsed` is a set of labels if the value is a QID - # see api_requests.parse_value + wikidata[qid][keys.BIODATA] = [] + # If `parsed` is a set, we have item labels, + # see `api_requests.parse_value` behavior if isinstance(parsed, set): - # The English label for gender should be enough - gender = parsed & {keys.MALE, keys.FEMALE} - if gender: - wikidata[qid][keys.BIODATA].add((pid, gender.pop())) - else: - # Add a (pid, label) tuple for each element - # for better recall - for element in parsed: - wikidata[qid][keys.BIODATA].add((pid, element)) - # `parsed` is a tuple (timestamp, precision) id the value is a date + # Keep track of the value QID + # Dict key checks are already done in `api_requests.parse_value`, + # so no need to redo it here + v_qid = value['id'] + # Normalize & de-duplicate labels + # `text_utils.normalize` returns a tuple with two forms + # (non-lower, lower): take the lowercased one + labels = {text_utils.normalize(label)[1] for label in parsed} + # e.g., (P19, Q641, {'venezia', 'venice', ...}) + wikidata[qid][keys.BIODATA].append( + (pid, v_qid, labels) + ) + # If `parsed` is a tuple, we have a (timestamp, precision) date elif isinstance(parsed, tuple): timestamp, precision = parsed[0], parsed[1] # Get rid of time, useless timestamp = timestamp.split('T')[0] - wikidata[qid][keys.BIODATA].add((pid, f'{timestamp}/{precision}')) + wikidata[qid][keys.BIODATA].append((pid, f'{timestamp}/{precision}')) else: - wikidata[qid][keys.BIODATA].add((pid, parsed)) + wikidata[qid][keys.BIODATA].append((pid, parsed)) total += 1 LOGGER.info('Got %d statements', total) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 46bdeb73..279ce135 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -678,6 +678,8 @@ def _compute_shared_and_extra(criterion, wd_data, 
target_data): wd_dates, wd_other = _extract_dates(wd_data) target_dates, target_other = _extract_dates(target_data) shared_dates, extra_dates = _compare_dates(wd_dates, target_dates) + import ipdb; ipdb.set_trace() + # FIXME data model has changed: _compare_others shared = wd_other.intersection(target_other).union(shared_dates) extra = target_other.difference(wd_other).union(extra_dates) else: @@ -686,6 +688,14 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): return shared, extra +def _compare_others(wd, target): + shared, extra = set(), set() + wd_matches, target_matches = [], [] + + for i, wd_elem in enumerate(wd): + for j, t_elem in enumerate(target): + + def _extract_dates(data): dates = set() @@ -697,6 +707,7 @@ def _extract_dates(data): def _compare_dates(wd, target): +def _compare(what, wd, target): # Ensure unique comparisons, regardless of different precisions. # For instance: # `wd` has '1986-01-01/9' and '1986-11-29/11' @@ -721,11 +732,13 @@ def _compare_dates(wd, target): # Skip unexpected `None` values if None in (wd_val, t_val): LOGGER.warning( - 'Skipping unexpected %s date pair with missing value(s)', + 'Skipping unexpected %s pair with missing value(s)', (wd_elem, t_elem), ) continue + if what == 'dates': + # FIXME extract function wd_timestamp, wd_precision = wd_val.split('/') t_timestamp, t_precision = t_val.split('/') From 00b52b7e782c2c835511bdc33e146f028af8ebbc Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Tue, 10 Aug 2021 21:12:57 +0200 Subject: [PATCH 14/22] [WIP] comparison logic for values other than dates; refactor comparison --- soweego/validator/checks.py | 125 +++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 279ce135..adf3023b 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -19,7 +19,7 @@ import click from sqlalchemy.exc import SQLAlchemyError -from soweego.commons import constants, data_gathering, keys, target_database +from soweego.commons import constants, data_gathering, keys, target_database, text_utils from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot from soweego.wikidata import vocabulary @@ -673,29 +673,26 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): def _compute_shared_and_extra(criterion, wd_data, target_data): - # Properly compare dates when checking biographical data - if criterion == keys.BIODATA: + if criterion == keys.LINKS: + shared = wd_data.intersection(target_data) + extra = target_data.difference(wd_data) + # Biographical validation requires more complex comparisons + elif criterion == keys.BIODATA: wd_dates, wd_other = _extract_dates(wd_data) target_dates, target_other = _extract_dates(target_data) - shared_dates, extra_dates = _compare_dates(wd_dates, target_dates) + shared_dates, extra_dates = _compare('dates', wd_dates, target_dates) import ipdb; ipdb.set_trace() - # FIXME data model has changed: _compare_others - shared = wd_other.intersection(target_other).union(shared_dates) - extra = target_other.difference(wd_other).union(extra_dates) + shared_other, extra_other = _compare('other', wd_other, target_other) + shared = shared_dates | shared_other + extra = extra_dates | extra_other else: - shared = wd_data.intersection(target_data) - extra = target_data.difference(wd_data) + raise ValueError( + f"Invalid validation criterion: '{criterion}'. 
" + f"Please use either '{keys.LINKS}' or '{keys.BIODATA}'" + ) return shared, extra -def _compare_others(wd, target): - shared, extra = set(), set() - wd_matches, target_matches = [], [] - - for i, wd_elem in enumerate(wd): - for j, t_elem in enumerate(target): - - def _extract_dates(data): dates = set() @@ -706,14 +703,12 @@ def _extract_dates(data): return dates, data.difference(dates) -def _compare_dates(wd, target): def _compare(what, wd, target): - # Ensure unique comparisons, regardless of different precisions. - # For instance: - # `wd` has '1986-01-01/9' and '1986-11-29/11' - # `target` has '1986-01-01/9' - # `shared_dates` will have one element - shared_dates, extra_dates = set(), set() + shared, extra = set(), set() + # Keep track of matches to avoid useless computation + # and incorrect comparisons: + # this happens when WD has multiple claims with + # the same property wd_matches, target_matches = [], [] for i, wd_elem in enumerate(wd): @@ -722,45 +717,68 @@ def _compare(what, wd, target): if i in wd_matches or j in target_matches: continue - wd_pid, wd_val = wd_elem - t_pid, t_val = t_elem - - # Don't compare birth with death dates - if wd_pid != t_pid: + # Don't compare different PIDs + if wd_elem[0] != t_elem[0]: continue # Skip unexpected `None` values - if None in (wd_val, t_val): + if None in (wd_elem[1], t_elem[1]): LOGGER.warning( 'Skipping unexpected %s pair with missing value(s)', (wd_elem, t_elem), ) continue - if what == 'dates': - # FIXME extract function - wd_timestamp, wd_precision = wd_val.split('/') - t_timestamp, t_precision = t_val.split('/') - - shared_date, extra_date = _match_dates_by_precision( - min(int(wd_precision), int(t_precision)), - wd_elem, - wd_timestamp, - t_elem, - t_timestamp, + inputs = ( + shared, extra, wd_matches, target_matches, + i, wd_elem, j, t_elem ) + if what == 'dates': + _compare_dates(inputs) + elif what == 'other': + _compare_other(inputs) + else: + raise ValueError( + f"Invalid argument: '{what}'. 
" + "Please use either 'dates' or 'other'" + ) + + return shared, extra + - if shared_date is not None: - shared_dates.add(shared_date) - # Keep track of matches to avoid useless computation - # and incorrect comparisons: - # this happens when WD has multiple claims with - # the same property - wd_matches.append(i) - target_matches.append(j) - elif extra_date is not None: - extra_dates.add(extra_date) - return shared_dates, extra_dates +def _compare_other(inputs): + shared, extra, wd_matches, target_matches, i, wd_elem, j, t_elem = inputs + pid, qid, wd_values = wd_elem + _, t_value = t_elem + + # TODO improve matching + if text_utils.normalize(t_value) in wd_values: + shared.add((pid, qid)) + wd_matches.append(i) + target_matches.append(j) + else: + # TODO resolve target string into QID + extra.add((pid, t_value)) + + +def _compare_dates(inputs): + shared, extra, wd_matches, target_matches, i, wd_elem, j, t_elem = inputs + + wd_timestamp, wd_precision = wd_elem[1].split('/') + t_timestamp, t_precision = t_elem[1].split('/') + shared_date, extra_date = _match_dates_by_precision( + min(int(wd_precision), int(t_precision)), + wd_elem, + wd_timestamp, + t_elem, + t_timestamp, + ) + if shared_date is not None: + shared.add(shared_date) + wd_matches.append(i) + target_matches.append(j) + elif extra_date is not None: + extra.add(extra_date) def _match_dates_by_precision( @@ -820,6 +838,7 @@ def _dump_csv_output(data, outpath, log_msg_subject): LOGGER.info("No %s to be added, won't dump to file", log_msg_subject) +# FIXME adapt to new data model def _load_wd_cache(file_handle): raw_cache = json.load(file_handle) cache = {} @@ -853,7 +872,7 @@ def _dump_wd_cache(cache, outpath): json.dump( { qid: { - data_type: list(values) + data_type: values for data_type, values in data.items() } for qid, data in cache.items() From 93bc7f4c20a974df78ebb78a70b9e3464a78d734 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Thu, 12 Aug 2021 10:41:41 +0000 Subject: [PATCH 15/22] feeling-lucky resolution of a QID given a term, closes #414 --- soweego/wikidata/api_requests.py | 47 ++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index 61cb7a99..9cdff417 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -25,9 +25,9 @@ __author__ = 'Marco Fossati' __email__ = 'fossati@spaziodati.eu' -__version__ = '1.0' +__version__ = '2.0' __license__ = 'GPL-3.0' -__copyright__ = 'Copyleft 2018, Hjfocs' +__copyright__ = 'Copyleft 2021, Hjfocs' LOGGER = logging.getLogger(__name__) @@ -39,8 +39,45 @@ BUCKET_SIZE = 500 +def resolve_qid(term: str, language='en') -> str: + """Try to resolve a QID given a search term, in a *feeling lucky* way. + + :param term: a search term + :param language: (optional) search in the given language code. + Default: ``en``. 
+ :return: the QID of the first result + """ + params = { + 'action': 'wbsearchentities', + 'format': 'json', + 'search': term, + 'language': language + } + response_body = _make_request(params) + + # Failed API request + if response_body is None: + return None + + try: + return response_body['search'][0]['id'] + # Malformed JSON response + except KeyError as e: + LOGGER.error( + "Missing '%s' key from JSON response: %s", e, response_body + ) + return None + # No search results + except IndexError: + LOGGER.info( + "No QIDs found for search term '%s' (language: %s)", + term, language + ) + return None + + def get_url_blacklist() -> set: - """Retrieve a blacklist with URL domains of low-quality sources + """Retrieve a blacklist with URL domains of low-quality sources. :return: the set of blacklisted domains """ @@ -56,12 +93,12 @@ def get_url_blacklist() -> set: if response_body is None: return None - # Handle malformed JSON response + # Malformed JSON response try: star = response_body['parse']['text']['*'] # Interesting nonsense key except KeyError as e: LOGGER.error( - "Missing key %s from JSON response: %s", e, response_body, + "Missing '%s' key from JSON response: %s", e, response_body ) return None From 5072c78f514926543ba23644c19c9457f9c0064e Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Thu, 12 Aug 2021 10:54:06 +0000 Subject: [PATCH 16/22] closes #413 ; closes #417 ; see #414 Fix bio values comparison; dump shared statements; feeling-lucky QID resolution; pickle Wikidata cache, instead of JSON dump; simplify bio comparison; revisit & rename variables. --- soweego/validator/checks.py | 286 ++++++++++++++++++------------------ 1 file changed, 146 insertions(+), 140 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index adf3023b..88011978 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -13,6 +13,7 @@ import json import logging import os +import pickle from collections import defaultdict from typing import DefaultDict, Dict, Iterator, Tuple, Union @@ -27,18 +28,18 @@ LOGGER = logging.getLogger(__name__) -# For dead_ids_cli -DEAD_IDS_FILENAME = '{}_{}_dead_identifiers.json' -WD_IDS_FILENAME = '{}_{}_identifiers_in_wikidata.json' -# For links_cli -LINKS_IDS_TO_BE_DEPRECATED_FILENAME = '{}_{}_identifiers_to_be_deprecated.json' -EXTRA_IDS_TO_BE_ADDED_FILENAME = '{}_{}_third_party_identifiers_to_be_added.csv' -URLS_TO_BE_ADDED_FILENAME = '{}_{}_urls_to_be_added.csv' -WD_LINKS_FILENAME = '{}_{}_urls_in_wikidata.json' -# For bio_cli -BIO_IDS_TO_BE_DEPRECATED_FILENAME = '{}_{}_identifiers_to_be_deprecated.json' -BIO_STATEMENTS_TO_BE_ADDED_FILENAME = '{}_{}_bio_statements_to_be_added.csv' -WD_BIO_FILENAME = '{}_{}_bio_data_in_wikidata.json' +# File name templates +# For all CLIs +WD_CACHE_FNAME = '{catalog}_{entity}_{criterion}_wd_cache.pkl' +IDS_TO_BE_DEPRECATED_FNAME = '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' +SHARED_STATEMENTS_FNAME = '{catalog}_{entity}_{criterion}_shared_statements.csv' +# For `dead_ids_cli` +DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' +# For `links_cli` +EXT_IDS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_external_ids_to_be_added.csv' +URLS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_urls_to_be_added.csv' +# For `bio_cli` +BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' @click.command() @@ -63,7 +64,7 @@ @click.option( '--dump-wikidata', is_flag=True, - help='Dump identifiers gathered from Wikidata to a JSON file.', + help='Dump identifiers 
gathered from Wikidata to a Python pickle.', ) @click.option( '--dir-io', @@ -80,24 +81,30 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): you can pass the '-d' flag to do so. """ dead_ids_path = os.path.join( - dir_io, DEAD_IDS_FILENAME.format(catalog, entity) + dir_io, DEAD_IDS_FNAME.format(catalog=catalog, entity=entity) + ) + wd_cache_path = os.path.join( + dir_io, WD_CACHE_FNAME.format( + catalog=catalog, entity=entity, criterion='dead_ids' + ) ) - wd_ids_path = os.path.join(dir_io, WD_IDS_FILENAME.format(catalog, entity)) # Handle Wikidata cache - if os.path.isfile(wd_ids_path): - with open(wd_ids_path) as wdin: - wd_ids = _load_wd_cache(wdin) + if os.path.isfile(wd_cache_path): + with open(wd_cache_path, 'rb') as cin: + wd_cache = pickle.load(cin) + LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the second return value: Wikidata cache - dead, _ = dead_ids(catalog, entity, wd_cache=wd_ids) + dead, _ = dead_ids(catalog, entity, wd_cache=wd_cache) else: - dead, wd_ids = dead_ids(catalog, entity) + dead, wd_cache = dead_ids(catalog, entity) - # Dump ids gathered from Wikidata + # Dump Wikidata cache if dump_wikidata: - _dump_wd_cache(wd_ids, wd_ids_path) + with open(wd_cache_path, 'wb') as cout: + pickle.dump(wd_cache, cout) LOGGER.info( - 'Identifiers gathered from Wikidata dumped to %s', wd_ids_path + 'Identifiers gathered from Wikidata dumped to %s', wd_cache_path ) # Dump dead ids @@ -144,7 +151,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): @click.option( '--dump-wikidata', is_flag=True, - help='Dump URLs gathered from Wikidata to a JSON file.', + help='Dump URLs gathered from Wikidata to a Python pickle.', ) @click.option( '--dir-io', @@ -152,6 +159,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): default=constants.SHARED_FOLDER, help=f'Input/output directory, default: {constants.SHARED_FOLDER}.', ) +# TODO adapt to also dump shared links def links_cli( catalog, entity, blacklist, upload, sandbox, dump_wikidata, dir_io ): @@ -172,34 +180,44 @@ def links_cli( The '-b' flag applies a URL blacklist of low-quality Web domains. 
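    For instance, a full links validation run over one catalog could
    look something like this (CATALOG and ENTITY are placeholders for a
    supported catalog/entity pair; check the command's --help for the
    exact argument order):

        python -m soweego sync links CATALOG ENTITY -b --dump-wikidata
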
""" + criterion = 'links' # Output paths - deprecated_path = os.path.join( - dir_io, LINKS_IDS_TO_BE_DEPRECATED_FILENAME.format(catalog, entity) + deprecate_path = os.path.join( + dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) ids_path = os.path.join( - dir_io, EXTRA_IDS_TO_BE_ADDED_FILENAME.format(catalog, entity) + dir_io, EXT_IDS_TO_BE_ADDED_FNAME.format( + catalog=catalog, entity=entity + ) ) urls_path = os.path.join( - dir_io, URLS_TO_BE_ADDED_FILENAME.format(catalog, entity) + dir_io, URLS_TO_BE_ADDED_FNAME.format( + catalog=catalog, entity=entity + ) ) - wd_links_path = os.path.join( - dir_io, WD_LINKS_FILENAME.format(catalog, entity) + wd_cache_path = os.path.join( + dir_io, WD_CACHE_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) # Handle Wikidata cache - if os.path.isfile(wd_links_path): - with open(wd_links_path) as wdin: - wd_links = _load_wd_cache(wdin) + if os.path.isfile(wd_cache_path): + with open(wd_cache_path, 'rb') as cin: + wd_cache = pickle.load(cin) + LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, _ = links( - catalog, entity, blacklist, wd_cache=wd_links + catalog, entity, blacklist, wd_cache=wd_cache ) else: ( ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, - wd_links, + wd_cache, ) = links(catalog, entity, blacklist) # Nothing to do: the catalog doesn't contain links @@ -208,11 +226,12 @@ def links_cli( # Dump Wikidata cache if dump_wikidata: - _dump_wd_cache(wd_links, wd_links_path) - LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_links_path) + with open(wd_cache_path, 'wb') as cout: + pickle.dump(wd_cache, cout) + LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_cache_path) # Dump output files - _dump_deprecated(ids_to_be_deprecated, deprecated_path) + _dump_deprecated(ids_to_be_deprecated, deprecate_path) _dump_csv_output(ext_ids_to_be_added, ids_path, 'third-party IDs') _dump_csv_output(urls_to_be_added, urls_path, 'URLs') @@ -223,7 +242,6 @@ def links_cli( 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) - criterion = 'links' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( 'deprecate', catalog, entity, ids_to_be_deprecated, sandbox @@ -257,7 +275,7 @@ def links_cli( @click.option( '--dump-wikidata', is_flag=True, - help='Dump biographical data gathered from Wikidata to a JSON file.', + help='Dump biographical data gathered from Wikidata to a Python pickle.', ) @click.option( '--dir-io', @@ -270,47 +288,68 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): Look for birth/death dates, birth/death places, gender. - Dump 2 output files: + Dump 3 output files: 1. catalog IDs to be deprecated. JSON format: {catalog_ID: [list of QIDs]} - 2. statements to be added. CSV format: + 2. shared statements. CSV format: + QID,PID,value,catalog_ID + + 3. statements to be added. CSV format: QID,PID,value,catalog_ID You can pass the '-u' flag to upload the output to Wikidata. 
""" - deprecated_path = os.path.join( - dir_io, BIO_IDS_TO_BE_DEPRECATED_FILENAME.format(catalog, entity) + criterion = 'bio' + deprecate_path = os.path.join( + dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) - statements_path = os.path.join( - dir_io, BIO_STATEMENTS_TO_BE_ADDED_FILENAME.format(catalog, entity) + shared_path = os.path.join( + dir_io, SHARED_STATEMENTS_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) + ) + extra_path = os.path.join( + dir_io, BIO_STATEMENTS_TO_BE_ADDED_FNAME.format( + catalog=catalog, entity=entity + ) + ) + wd_cache_path = os.path.join( + dir_io, WD_CACHE_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) - wd_bio_path = os.path.join(dir_io, WD_BIO_FILENAME.format(catalog, entity)) # Handle Wikidata cache - if os.path.isfile(wd_bio_path): - with open(wd_bio_path) as wdin: - wd_bio = _load_wd_cache(wdin) + if os.path.isfile(wd_cache_path): + with open(wd_cache_path, 'rb') as cin: + wd_cache = pickle.load(cin) + LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - to_be_deprecated, to_be_added, _ = bio(catalog, entity, wd_cache=wd_bio) + deprecate, shared, extra, _ = bio(catalog, entity, wd_cache=wd_cache) else: - to_be_deprecated, to_be_added, wd_bio = bio(catalog, entity) + deprecate, shared, extra, wd_cache = bio(catalog, entity) # Nothing to do: the catalog doesn't contain biographical data - if to_be_deprecated is None: + if deprecate is None: return # Dump Wikidata cache if dump_wikidata: - _dump_wd_cache(wd_bio, wd_bio_path) + with open(wd_cache_path, 'wb') as cout: + pickle.dump(wd_cache, cout) LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', wd_bio_path + 'Biographical data gathered from Wikidata dumped to %s', + wd_cache_path ) # Dump output files - _dump_deprecated(to_be_deprecated, deprecated_path) - _dump_csv_output(to_be_added, statements_path, 'statements') + _dump_deprecated(deprecate, deprecate_path) + _dump_csv_output(shared, shared_path, 'shared statements') + _dump_csv_output(extra, extra_path, 'extra statements') # Upload the output to Wikidata if upload: @@ -319,12 +358,15 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) - criterion = 'bio' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( - 'deprecate', catalog, entity, to_be_deprecated, sandbox + 'deprecate', catalog, entity, deprecate, sandbox ) - LOGGER.info('Starting addition of statements to Wikidata ...') + LOGGER.info('Starting referencing of shared statements in Wikidata ...') + wikidata_bot.add_people_statements( + catalog, to_be_added, criterion, sandbox + ) + LOGGER.info('Starting addition of extra statements to Wikidata ...') wikidata_bot.add_people_statements( catalog, to_be_added, criterion, sandbox ) @@ -454,7 +496,7 @@ def links( if target_links is None: return None, None, None, None - to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set) + deprecate, add = defaultdict(set), defaultdict(set) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -473,13 +515,13 @@ def links( wd_links = wd_cache # Validation - _validate(keys.LINKS, wd_links, target_links, to_be_deprecated, to_be_added) + _validate(keys.LINKS, wd_links, target_links, deprecate, add) # Separate external IDs from URLs ( ext_ids_to_be_added, 
urls_to_be_added, - ) = data_gathering.extract_ids_from_urls(to_be_added, ext_id_pids_to_urls) + ) = data_gathering.extract_ids_from_urls(add, ext_id_pids_to_urls) # Apply URL blacklist if url_blacklist: @@ -492,17 +534,17 @@ def links( 'URL statements to be added: %d', catalog, entity, - len(to_be_deprecated), + len(deprecate), len(ext_ids_to_be_added), len(urls_to_be_added), ) - return to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, wd_links + return deprecate, ext_ids_to_be_added, urls_to_be_added, wd_links def bio( catalog: str, entity: str, wd_cache=None -) -> Union[Tuple[defaultdict, Iterator, dict], Tuple[None, None, None]]: +) -> Union[Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None]]: """Validate identifiers against available biographical data. Look for: @@ -533,20 +575,21 @@ def bio( A supported entity :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata in a previous run - :return: 3 objects + :return: a ``tuple`` of 4 objects 1. ``dict`` of identifiers that should be deprecated - 2. ``generator`` of statements that should be added - 3. ``dict`` of biographical data gathered from Wikidata + 2. ``generator`` of shared statements that should be referenced + 3. ``generator`` of statements that should be added + 4. ``dict`` of biographical data gathered from Wikidata """ # Target catalog side first: # enable early return in case of no target data target_bio = data_gathering.gather_target_biodata(entity, catalog) if target_bio is None: - return None, None, None + return None, None, None, None - to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set) + deprecate, reference, add = defaultdict(set), defaultdict(set), defaultdict(set) # Wikidata side if wd_cache is None: @@ -562,9 +605,13 @@ def bio( wd_bio = wd_cache # Validation - _validate(keys.BIODATA, wd_bio, target_bio, to_be_deprecated, to_be_added) + _validate( + keys.BIODATA, + wd_bio, target_bio, + deprecate, reference, add + ) - return to_be_deprecated, _bio_to_be_added_generator(to_be_added), wd_bio + return deprecate, _bio_statements_generator(reference), _bio_statements_generator(add), wd_bio def _apply_url_blacklist(url_statements): @@ -588,13 +635,13 @@ def _apply_url_blacklist(url_statements): return url_statements -def _bio_to_be_added_generator(to_be_added): - for (qid, tid,), values in to_be_added.items(): +def _bio_statements_generator(stmts_dict): + for (qid, tid), values in stmts_dict.items(): for pid, value in values: yield qid, pid, value, tid -def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): +def _validate(criterion, wd, target_generator, deprecate, reference, add): LOGGER.info('Starting check against target %s ...', criterion) target = _consume_target_generator(target_generator) @@ -641,7 +688,7 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): qid, tid, ) - to_be_deprecated[tid].add(qid) + deprecate[tid].add(qid) else: LOGGER.debug( '%s and %s share these %s: %s', @@ -650,6 +697,7 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): criterion, shared_data, ) + reference[(qid, tid)].update(shared_data) if extra_data: LOGGER.debug( @@ -659,16 +707,18 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): qid, extra_data, ) - to_be_added[(qid, tid,)].update(extra_data) + add[(qid, tid)].update(extra_data) else: LOGGER.debug('%s has no extra %s', tid, criterion) LOGGER.info( 'Check against target %s completed: %d IDs to be deprecated, 
' + '%d Wikidata items with shared statements to be referenced, ', '%d Wikidata items with statements to be added', criterion, - len(to_be_deprecated), - len(to_be_added), + len(deprecate), + len(reference), + len(add), ) @@ -678,10 +728,17 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): extra = target_data.difference(wd_data) # Biographical validation requires more complex comparisons elif criterion == keys.BIODATA: - wd_dates, wd_other = _extract_dates(wd_data) - target_dates, target_other = _extract_dates(target_data) + # `wd_data` has either couples or triples: couples are dates + wd_dates = set(filter(lambda x: len(x) == 2, wd_data)) + # No cast to `set` because `wd_data` triples hold sets themselves + wd_other = list(filter(lambda x: len(x) == 3, wd_data)) + # In `target_data` we look for relevant date PIDs + target_dates = set(filter( + lambda x: x[0] in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), + target_data + )) + target_other = target_data.difference(target_dates) shared_dates, extra_dates = _compare('dates', wd_dates, target_dates) - import ipdb; ipdb.set_trace() shared_other, extra_other = _compare('other', wd_other, target_other) shared = shared_dates | shared_other extra = extra_dates | extra_other @@ -694,15 +751,6 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): return shared, extra -def _extract_dates(data): - dates = set() - for pid, value in data: - if pid in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH): - dates.add((pid, value)) - # Separate dates from other data - return dates, data.difference(dates) - - def _compare(what, wd, target): shared, extra = set(), set() # Keep track of matches to avoid useless computation @@ -751,14 +799,17 @@ def _compare_other(inputs): pid, qid, wd_values = wd_elem _, t_value = t_elem + # Take the lowercased normalized value # TODO improve matching - if text_utils.normalize(t_value) in wd_values: + _, t_normalized = text_utils.normalize(t_value) + if t_normalized in wd_values: shared.add((pid, qid)) wd_matches.append(i) target_matches.append(j) else: - # TODO resolve target string into QID - extra.add((pid, t_value)) + t_qid = api_requests.resolve_qid(t_normalized) + if t_qid is not None: + extra.add((pid, t_qid)) def _compare_dates(inputs): @@ -838,51 +889,6 @@ def _dump_csv_output(data, outpath, log_msg_subject): LOGGER.info("No %s to be added, won't dump to file", log_msg_subject) -# FIXME adapt to new data model -def _load_wd_cache(file_handle): - raw_cache = json.load(file_handle) - cache = {} - for qid, data in raw_cache.items(): - for data_type, value_list in data.items(): - # Biodata has values that are a list - if value_list and isinstance(value_list[0], list): - value_set = set() - for value in value_list: - if isinstance(value[1], list): - same_pid, different_values = value[0], value[1] - for val in different_values: - value_set.add((same_pid, val)) - else: - value_set.add(tuple(value)) - if cache.get(qid): - cache[qid][data_type] = value_set - else: - cache[qid] = {data_type: value_set} - else: - if cache.get(qid): - cache[qid][data_type] = set(value_list) - else: - cache[qid] = {data_type: set(value_list)} - LOGGER.info("Loaded Wikidata cache from '%s'", file_handle.name) - return cache - - -def _dump_wd_cache(cache, outpath): - with open(outpath, 'w') as outfile: - json.dump( - { - qid: { - data_type: values - for data_type, values in data.items() - } - for qid, data in cache.items() - }, - outfile, - indent=2, - ensure_ascii=False, - ) - - def 
_consume_target_generator(target_generator): target = defaultdict(set) for identifier, *data in target_generator: From cbc36b9d40fb2f2efdd5300c83b4e33b01e87d03 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Thu, 12 Aug 2021 16:03:43 +0200 Subject: [PATCH 17/22] use P2888 for bare URLs statements, closes #409 --- soweego/commons/data_gathering.py | 2 +- soweego/wikidata/vocabulary.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/soweego/commons/data_gathering.py b/soweego/commons/data_gathering.py index 86a150ee..15085635 100644 --- a/soweego/commons/data_gathering.py +++ b/soweego/commons/data_gathering.py @@ -527,7 +527,7 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls): ext_ids_to_add.append((qid, pid, ext_id, tid,)) else: urls_to_add.append( - (qid, vocabulary.DESCRIBED_AT_URL, url, tid,) + (qid, vocabulary.EXACT_MATCH, url, tid,) ) return ( ext_ids_to_add, diff --git a/soweego/wikidata/vocabulary.py b/soweego/wikidata/vocabulary.py index 7219e5fc..421204af 100644 --- a/soweego/wikidata/vocabulary.py +++ b/soweego/wikidata/vocabulary.py @@ -53,6 +53,9 @@ # Widely used generic property to hold URLs DESCRIBED_AT_URL = 'P973' OFFICIAL_WEBSITE = 'P856' +# See BrokenSegue's comment at +# https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/08#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) +EXACT_MATCH = 'P2888' # Class QID of supported entities # People From 8205c80a8d23bd0e282e15bf89d1a20c2aa647c7 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Thu, 12 Aug 2021 16:22:38 +0200 Subject: [PATCH 18/22] validation order: deprecate, add, reference; links validation: dump shared statements; revisit & rename variables --- soweego/validator/checks.py | 177 +++++++++++++++++++++--------------- 1 file changed, 102 insertions(+), 75 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 88011978..ce0349ea 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -23,7 +23,7 @@ from soweego.commons import constants, data_gathering, keys, target_database, text_utils from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot -from soweego.wikidata import vocabulary +from soweego.wikidata import vocabulary, api_requests from soweego.wikidata.api_requests import get_url_blacklist LOGGER = logging.getLogger(__name__) @@ -36,8 +36,8 @@ # For `dead_ids_cli` DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' # For `links_cli` -EXT_IDS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_external_ids_to_be_added.csv' -URLS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_urls_to_be_added.csv' +EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv' +URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv' # For `bio_cli` BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' @@ -159,13 +159,12 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): default=constants.SHARED_FOLDER, help=f'Input/output directory, default: {constants.SHARED_FOLDER}.', ) -# TODO adapt to also dump shared links def links_cli( catalog, entity, blacklist, upload, sandbox, dump_wikidata, dir_io ): """Validate identifiers against links. - Dump 3 output files: + Dump 5 output files: 1. catalog IDs to be deprecated. JSON format: {catalog_ID: [list of QIDs]} @@ -174,11 +173,15 @@ def links_cli( QID,third-party_PID,third-party_ID,catalog_ID 3. URLs to be added. CSV format: - QID,P973,URL,catalog_ID + QID,P2888,URL,catalog_ID + + 4. 
third-party IDs to be referenced. Same format as file #2 + + 5. URLs to be referenced. Same format as file #3 You can pass the '-u' flag to upload the output to Wikidata. - The '-b' flag applies a URL blacklist of low-quality Web domains. + The '-b' flag applies a URL blacklist of low-quality Web domains to file #3. """ criterion = 'links' # Output paths @@ -187,14 +190,24 @@ def links_cli( catalog=catalog, entity=entity, criterion=criterion ) ) - ids_path = os.path.join( - dir_io, EXT_IDS_TO_BE_ADDED_FNAME.format( - catalog=catalog, entity=entity + add_ext_ids_path = os.path.join( + dir_io, EXT_IDS_FNAME.format( + catalog=catalog, entity=entity, task='added' ) ) - urls_path = os.path.join( - dir_io, URLS_TO_BE_ADDED_FNAME.format( - catalog=catalog, entity=entity + add_urls_path = os.path.join( + dir_io, URLS_FNAME.format( + catalog=catalog, entity=entity, task='added' + ) + ) + ref_ext_ids_path = os.path.join( + dir_io, EXT_IDS_FNAME.format( + catalog=catalog, entity=entity, task='referenced' + ) + ) + ref_urls_path = os.path.join( + dir_io, URLS_FNAME.format( + catalog=catalog, entity=entity, task='referenced' ) ) wd_cache_path = os.path.join( @@ -209,19 +222,16 @@ def links_cli( wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, _ = links( + deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, _ = links( catalog, entity, blacklist, wd_cache=wd_cache ) else: - ( - ids_to_be_deprecated, - ext_ids_to_be_added, - urls_to_be_added, - wd_cache, - ) = links(catalog, entity, blacklist) + deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links( + catalog, entity, blacklist + ) # Nothing to do: the catalog doesn't contain links - if ids_to_be_deprecated is None: + if deprecate is None: return # Dump Wikidata cache @@ -231,9 +241,11 @@ def links_cli( LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_cache_path) # Dump output files - _dump_deprecated(ids_to_be_deprecated, deprecate_path) - _dump_csv_output(ext_ids_to_be_added, ids_path, 'third-party IDs') - _dump_csv_output(urls_to_be_added, urls_path, 'URLs') + _dump_deprecated(deprecate, deprecate_path) + _dump_csv_output(add_ext_ids, add_ext_ids_path, 'third-party IDs to be added') + _dump_csv_output(add_urls, add_urls_path, 'URLs to be added') + _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced') + _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced') # Upload the output to Wikidata if upload: @@ -244,15 +256,23 @@ def links_cli( ) LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( - 'deprecate', catalog, entity, ids_to_be_deprecated, sandbox + 'deprecate', catalog, entity, deprecate, sandbox ) LOGGER.info('Starting addition of external IDs to Wikidata ...') wikidata_bot.add_people_statements( - catalog, ext_ids_to_be_added, criterion, sandbox + catalog, add_ext_ids, criterion, sandbox + ) + LOGGER.info('Starting addition of URLs to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, add_urls, criterion, sandbox + ) + LOGGER.info('Starting referencing of shared external IDs in Wikidata ...') + wikidata_bot.add_people_statements( + catalog, add_ext_ids, criterion, sandbox ) - LOGGER.info('Starting addition of statements to Wikidata ...') + LOGGER.info('Starting referencing of shared URLs in Wikidata ...') wikidata_bot.add_people_statements( - 
catalog, urls_to_be_added, criterion, sandbox + catalog, add_urls, criterion, sandbox ) @@ -293,11 +313,10 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): 1. catalog IDs to be deprecated. JSON format: {catalog_ID: [list of QIDs]} - 2. shared statements. CSV format: + 2. statements to be added. CSV format: QID,PID,value,catalog_ID - 3. statements to be added. CSV format: - QID,PID,value,catalog_ID + 3. shared statements to be referenced. Same format as file #2 You can pass the '-u' flag to upload the output to Wikidata. """ @@ -307,16 +326,16 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): catalog=catalog, entity=entity, criterion=criterion ) ) - shared_path = os.path.join( - dir_io, SHARED_STATEMENTS_FNAME.format( - catalog=catalog, entity=entity, criterion=criterion - ) - ) - extra_path = os.path.join( + add_path = os.path.join( dir_io, BIO_STATEMENTS_TO_BE_ADDED_FNAME.format( catalog=catalog, entity=entity ) ) + ref_path = os.path.join( + dir_io, SHARED_STATEMENTS_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) + ) wd_cache_path = os.path.join( dir_io, WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion @@ -329,9 +348,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - deprecate, shared, extra, _ = bio(catalog, entity, wd_cache=wd_cache) + deprecate, add, reference, _ = bio(catalog, entity, wd_cache=wd_cache) else: - deprecate, shared, extra, wd_cache = bio(catalog, entity) + deprecate, add, reference, wd_cache = bio(catalog, entity) # Nothing to do: the catalog doesn't contain biographical data if deprecate is None: @@ -348,10 +367,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # Dump output files _dump_deprecated(deprecate, deprecate_path) - _dump_csv_output(shared, shared_path, 'shared statements') - _dump_csv_output(extra, extra_path, 'extra statements') + _dump_csv_output(add, add_path, 'statements to be added') + _dump_csv_output(reference, ref_path, 'shared statements to be referenced') - # Upload the output to Wikidata + # Upload the output to Wikidata: + # deprecate, add, reference if upload: if sandbox: LOGGER.info( @@ -362,13 +382,13 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): wikidata_bot.delete_or_deprecate_identifiers( 'deprecate', catalog, entity, deprecate, sandbox ) - LOGGER.info('Starting referencing of shared statements in Wikidata ...') + LOGGER.info('Starting addition of extra statements to Wikidata ...') wikidata_bot.add_people_statements( - catalog, to_be_added, criterion, sandbox + catalog, add, criterion, sandbox ) - LOGGER.info('Starting addition of extra statements to Wikidata ...') + LOGGER.info('Starting referencing of shared statements in Wikidata ...') wikidata_bot.add_people_statements( - catalog, to_be_added, criterion, sandbox + catalog, reference, criterion, sandbox ) @@ -453,7 +473,7 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Union[Tuple[defaultdict, list, list, dict], Tuple[None, None, None, None]]: +) -> Union[Tuple[defaultdict, list, list, list, list, dict], Tuple[None, None, None, None, None, None]]: """Validate identifiers against available links. Also generate statements based on additional links @@ -482,21 +502,23 @@ def links( of URL domains. 
Default: ``False`` :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata in a previous run. Default: ``None`` - :return: 4 objects + :return: ``tuple`` of 6 objects 1. ``dict`` of identifiers that should be deprecated 2. ``list`` of third-party identifiers that should be added 3. ``list`` of URLs that should be added - 4. ``dict`` of links gathered from Wikidata + 4. ``list`` of third-party identifiers that should be referenced + 5. ``list`` of URLs that should be referenced + 6. ``dict`` of links gathered from Wikidata """ # Target catalog side first: # enable early return in case of no target links target_links = data_gathering.gather_target_links(entity, catalog) if target_links is None: - return None, None, None, None + return None, None, None, None, None, None - deprecate, add = defaultdict(set), defaultdict(set) + deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -515,31 +537,36 @@ def links( wd_links = wd_cache # Validation - _validate(keys.LINKS, wd_links, target_links, deprecate, add) + _validate(keys.LINKS, wd_links, target_links, deprecate, add, reference) - # Separate external IDs from URLs - ( - ext_ids_to_be_added, - urls_to_be_added, - ) = data_gathering.extract_ids_from_urls(add, ext_id_pids_to_urls) - - # Apply URL blacklist + # Links to be added: + # 1. Separate external IDs from URLs + add_ext_ids, add_urls = data_gathering.extract_ids_from_urls( + add, ext_id_pids_to_urls + ) + # 2. Apply URL blacklist if url_blacklist: - urls_to_be_added = _apply_url_blacklist(urls_to_be_added) + add_urls = _apply_url_blacklist(add_urls) + + # Links to be referenced: separate external IDs from URLs + ref_ext_ids, ref_urls = data_gathering.extract_ids_from_urls( + reference, ext_id_pids_to_urls + ) LOGGER.info( 'Validation completed. Target: %s %s. ' 'IDs to be deprecated: %d. ' 'Third-party IDs to be added: %d. ' 'URL statements to be added: %d', - catalog, - entity, + 'Third-party IDs to be referenced: %d. ' + 'URL statements to be referenced: %d', + catalog, entity, len(deprecate), - len(ext_ids_to_be_added), - len(urls_to_be_added), + len(add_ext_ids), len(add_urls), + len(ref_ext_ids), len(ref_urls) ) - return deprecate, ext_ids_to_be_added, urls_to_be_added, wd_links + return deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_links def bio( @@ -578,8 +605,8 @@ def bio( :return: a ``tuple`` of 4 objects 1. ``dict`` of identifiers that should be deprecated - 2. ``generator`` of shared statements that should be referenced - 3. ``generator`` of statements that should be added + 2. ``generator`` of statements that should be added + 3. ``generator`` of shared statements that should be referenced 4. 
``dict`` of biographical data gathered from Wikidata """ @@ -589,7 +616,7 @@ def bio( if target_bio is None: return None, None, None, None - deprecate, reference, add = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) # Wikidata side if wd_cache is None: @@ -608,10 +635,10 @@ def bio( _validate( keys.BIODATA, wd_bio, target_bio, - deprecate, reference, add + deprecate, add, reference ) - return deprecate, _bio_statements_generator(reference), _bio_statements_generator(add), wd_bio + return deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), wd_bio def _apply_url_blacklist(url_statements): @@ -641,7 +668,7 @@ def _bio_statements_generator(stmts_dict): yield qid, pid, value, tid -def _validate(criterion, wd, target_generator, deprecate, reference, add): +def _validate(criterion, wd, target_generator, deprecate, add, reference): LOGGER.info('Starting check against target %s ...', criterion) target = _consume_target_generator(target_generator) @@ -713,12 +740,12 @@ def _validate(criterion, wd, target_generator, deprecate, reference, add): LOGGER.info( 'Check against target %s completed: %d IDs to be deprecated, ' - '%d Wikidata items with shared statements to be referenced, ', '%d Wikidata items with statements to be added', + '%d Wikidata items with shared statements to be referenced, ', criterion, len(deprecate), - len(reference), len(add), + len(reference), ) @@ -884,9 +911,9 @@ def _dump_csv_output(data, outpath, log_msg_subject): with open(outpath, 'w') as ids_out: writer = csv.writer(ids_out) writer.writerows(data) - LOGGER.info('%s to be added dumped to %s', log_msg_subject, outpath) + LOGGER.info('%s dumped to %s', log_msg_subject, outpath) else: - LOGGER.info("No %s to be added, won't dump to file", log_msg_subject) + LOGGER.info("No %s, won't dump to file", log_msg_subject) def _consume_target_generator(target_generator): From 20906aa7accc2179afe9f9512de05a7239ab2a61 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Fri, 13 Aug 2021 09:43:43 +0000 Subject: [PATCH 19/22] dump output first, then eventual cache; use highest pickle protocol; fix log formatting exception --- soweego/validator/checks.py | 67 +++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index ce0349ea..ff967438 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -99,14 +99,6 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): else: dead, wd_cache = dead_ids(catalog, entity) - # Dump Wikidata cache - if dump_wikidata: - with open(wd_cache_path, 'wb') as cout: - pickle.dump(wd_cache, cout) - LOGGER.info( - 'Identifiers gathered from Wikidata dumped to %s', wd_cache_path - ) - # Dump dead ids with open(dead_ids_path, 'w') as fout: # Sets are not serializable to JSON, so cast them to lists @@ -115,9 +107,21 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): fout, indent=2, ) - LOGGER.info('Dead identifiers dumped to %s', dead_ids_path) + # Dump Wikidata cache + if dump_wikidata: + try: + with open(wd_cache_path, 'wb') as cout: + # Using the highest protocol available for the current Python + # version should be the most efficient solution + pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) + LOGGER.info( + 'Identifiers gathered from Wikidata dumped to %s', wd_cache_path + ) + except MemoryError: + 
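+            # A very large Wikidata cache may not fit in memory while
+            # being pickled: in that case, just warn and skip the dump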
LOGGER.warning('Could not pickle the Wikidata cache: memory error') + # Deprecate dead ids in Wikidata if deprecate: LOGGER.info('Starting deprecation of %s IDs ...', catalog) @@ -234,12 +238,6 @@ def links_cli( if deprecate is None: return - # Dump Wikidata cache - if dump_wikidata: - with open(wd_cache_path, 'wb') as cout: - pickle.dump(wd_cache, cout) - LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_cache_path) - # Dump output files _dump_deprecated(deprecate, deprecate_path) _dump_csv_output(add_ext_ids, add_ext_ids_path, 'third-party IDs to be added') @@ -247,6 +245,19 @@ def links_cli( _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced') _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced') + # Dump Wikidata cache + if dump_wikidata: + try: + with open(wd_cache_path, 'wb') as cout: + # Using the highest protocol available for the current Python + # version should be the most efficient solution + pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) + LOGGER.info( + 'URLs gathered from Wikidata dumped to %s', wd_cache_path + ) + except MemoryError: + LOGGER.warning('Could not pickle the Wikidata cache: memory error') + # Upload the output to Wikidata if upload: if sandbox: @@ -356,20 +367,24 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): if deprecate is None: return - # Dump Wikidata cache - if dump_wikidata: - with open(wd_cache_path, 'wb') as cout: - pickle.dump(wd_cache, cout) - LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', - wd_cache_path - ) - # Dump output files _dump_deprecated(deprecate, deprecate_path) _dump_csv_output(add, add_path, 'statements to be added') _dump_csv_output(reference, ref_path, 'shared statements to be referenced') + # Dump Wikidata cache + if dump_wikidata: + try: + with open(wd_cache_path, 'wb') as cout: + # Using the highest protocol available for the current Python + # version should be the most efficient solution + pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) + LOGGER.info( + 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path + ) + except MemoryError: + LOGGER.warning('Could not pickle the Wikidata cache: memory error') + # Upload the output to Wikidata: # deprecate, add, reference if upload: @@ -740,8 +755,8 @@ def _validate(criterion, wd, target_generator, deprecate, add, reference): LOGGER.info( 'Check against target %s completed: %d IDs to be deprecated, ' - '%d Wikidata items with statements to be added', - '%d Wikidata items with shared statements to be referenced, ', + '%d Wikidata items with statements to be added, ', + '%d Wikidata items with shared statements to be referenced', criterion, len(deprecate), len(add), From 39345a6a82d9ca0f67ad7df94f6400942f45dbac Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Wed, 18 Aug 2021 13:04:43 +0200 Subject: [PATCH 20/22] better type annotation & docstring --- soweego/wikidata/api_requests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index 9cdff417..c5e6ad75 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -10,7 +10,7 @@ from collections import defaultdict from functools import lru_cache, partial from multiprocessing.pool import Pool -from typing import Dict, Iterator, List, Set, TextIO, Tuple, Union +from typing import Dict, Iterator, List, Optional, Set, TextIO, Tuple, Union from urllib.parse import 
urlunsplit import lxml.html @@ -39,13 +39,13 @@ BUCKET_SIZE = 500 -def resolve_qid(term: str, language='en') -> str: +def resolve_qid(term: str, language='en') -> Optional[str]: """Try to resolve a QID given a search term, in a *feeling lucky* way. :param term: a search term :param language: (optional) search in the given language code. Default: ``en``. - :return: the QID of the first result + :return: the QID of the first result, or ``None`` in case of no result """ params = { 'action': 'wbsearchentities', From fa16a4b45930d41da67f4e3fa7aa76c81c485589 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Wed, 18 Aug 2021 13:09:13 +0200 Subject: [PATCH 21/22] better type annotation & docstring again --- soweego/wikidata/api_requests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index c5e6ad75..fb98dc8f 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -76,10 +76,11 @@ def resolve_qid(term: str, language='en') -> Optional[str]: return None -def get_url_blacklist() -> set: +def get_url_blacklist() -> Optional[set]: """Retrieve a blacklist with URL domains of low-quality sources. - :return: the set of blacklisted domains + :return: the set of blacklisted domains, + or ``None`` in case of issues with the Wikidata Web API """ params = { 'action': 'parse', From 661e20bc36635047e1b6e8754670d407850c8632 Mon Sep 17 00:00:00 2001 From: travis Date: Wed, 18 Aug 2021 11:30:45 +0000 Subject: [PATCH 22/22] format code & organize imports --- soweego/commons/data_gathering.py | 20 ++-- soweego/ingester/wikidata_bot.py | 154 +++++++++++++++++--------- soweego/linker/baseline.py | 4 +- soweego/validator/checks.py | 174 +++++++++++++++++++----------- soweego/wikidata/api_requests.py | 5 +- 5 files changed, 228 insertions(+), 129 deletions(-) diff --git a/soweego/commons/data_gathering.py b/soweego/commons/data_gathering.py index 15085635..30bdb752 100644 --- a/soweego/commons/data_gathering.py +++ b/soweego/commons/data_gathering.py @@ -20,7 +20,13 @@ from sqlalchemy import or_ from tqdm import tqdm -from soweego.commons import constants, keys, target_database, text_utils, url_utils +from soweego.commons import ( + constants, + keys, + target_database, + text_utils, + url_utils, +) from soweego.commons.db_manager import DBManager from soweego.importer import models from soweego.wikidata import api_requests, sparql_queries, vocabulary @@ -395,15 +401,15 @@ def gather_wikidata_biodata(wikidata): # (non-lower, lower): take the lowercased one labels = {text_utils.normalize(label)[1] for label in parsed} # e.g., (P19, Q641, {'venezia', 'venice', ...}) - wikidata[qid][keys.BIODATA].append( - (pid, v_qid, labels) - ) + wikidata[qid][keys.BIODATA].append((pid, v_qid, labels)) # If `parsed` is a tuple, we have a (timestamp, precision) date elif isinstance(parsed, tuple): timestamp, precision = parsed[0], parsed[1] # Get rid of time, useless timestamp = timestamp.split('T')[0] - wikidata[qid][keys.BIODATA].append((pid, f'{timestamp}/{precision}')) + wikidata[qid][keys.BIODATA].append( + (pid, f'{timestamp}/{precision}') + ) else: wikidata[qid][keys.BIODATA].append((pid, parsed)) total += 1 @@ -526,9 +532,7 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls): if ext_id is not None: ext_ids_to_add.append((qid, pid, ext_id, tid,)) else: - urls_to_add.append( - (qid, vocabulary.EXACT_MATCH, url, tid,) - ) + urls_to_add.append((qid, vocabulary.EXACT_MATCH, url, tid,)) return ( 
ext_ids_to_add, urls_to_add, diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 30ad56f1..0065b2b7 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -112,8 +112,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): """ if sandbox: LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) delete_or_deprecate_identifiers( @@ -141,8 +140,7 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): """ if sandbox: LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) delete_or_deprecate_identifiers( @@ -193,7 +191,7 @@ def identifiers_cli(catalog, entity, identifiers, sandbox): '--criterion', type=click.Choice(('links', 'bio')), help='Validation criterion used to generate STATEMENTS. ' - 'Same as the command passed to `python -m soweego sync`' + 'Same as the command passed to `python -m soweego sync`', ) @click.option( '-s', @@ -235,9 +233,7 @@ def people_cli(catalog, statements, criterion, sandbox): edit_summary = None if sandbox: - LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', sandbox_item - ) + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) stmt_reader = csv.reader(statements) for person, predicate, value, catalog_id in stmt_reader: @@ -248,7 +244,7 @@ def people_cli(catalog, statements, criterion, sandbox): catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) @@ -295,8 +291,11 @@ def works_cli(catalog, statements, sandbox): _add_or_reference_works( (subject, predicate, person), heuristic, - catalog_qid, person_pid, person_id, - is_imdb=is_imdb, edit_summary=WORKS_SUMMARY + catalog_qid, + person_pid, + person_id, + is_imdb=is_imdb, + edit_summary=WORKS_SUMMARY, ) @@ -327,7 +326,8 @@ def add_identifiers( _add_or_reference( (subject, catalog_pid, tid,), heuristic, - edit_summary=IDENTIFIERS_SUMMARY) + edit_summary=IDENTIFIERS_SUMMARY, + ) def add_people_statements( @@ -368,7 +368,10 @@ def add_people_statements( for subject, predicate, value, catalog_id in statements: LOGGER.info( 'Processing (%s, %s, %s, %s) statement ...', - subject, predicate, value, catalog_id + subject, + predicate, + value, + catalog_id, ) actual_subject = subject if not sandbox else sandbox_item _add_or_reference( @@ -377,7 +380,7 @@ def add_people_statements( catalog_qid=catalog_qid, catalog_pid=person_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) @@ -407,7 +410,10 @@ def add_works_statements( for work, predicate, person, person_id in statements: LOGGER.info( 'Processing (%s, %s, %s, %s) statement', - work, predicate, person, person_id + work, + predicate, + person, + person_id, ) subject = work if not sandbox else sandbox_item _add_or_reference_works( @@ -417,7 +423,7 @@ def add_works_statements( person_pid, person_id, is_imdb=is_imdb, - edit_summary=WORKS_SUMMARY + edit_summary=WORKS_SUMMARY, ) @@ -456,15 +462,23 @@ def delete_or_deprecate_identifiers( _delete_or_deprecate(action, actual_qid, tid, catalog, catalog_pid) -def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_pid: str, catalog_id: str, - is_imdb=False, edit_summary=None) -> None: +def _add_or_reference_works( + statement: tuple, + heuristic: str, + catalog_qid: str, 
+ catalog_pid: str, + catalog_id: str, + is_imdb=False, + edit_summary=None, +) -> None: work, predicate, person = statement # Parse value into an item in case of QID qid = match(QID_REGEX, person) if not qid: LOGGER.warning( "%s doesn't look like a QID, won't try to add the %s statement", - person, statement + person, + statement, ) return person_item = pywikibot.ItemPage(REPO, qid.group()) @@ -490,7 +504,7 @@ def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ): return @@ -508,15 +522,21 @@ def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, def _add_or_reference( - statement, heuristic, - catalog_qid=None, catalog_pid=None, catalog_id=None, - edit_summary=None + statement, + heuristic, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ) -> None: subject, predicate, value = statement subject_item, claims = _essential_checks( - statement, heuristic, - catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + statement, + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary, ) if None in (subject_item, claims): @@ -533,7 +553,7 @@ def _add_or_reference( edit_summary=edit_summary, catalog_qid=catalog_qid, catalog_pid=catalog_pid, - catalog_id=catalog_id + catalog_id=catalog_id, ): return @@ -554,7 +574,7 @@ def _add_or_reference( catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) @@ -577,12 +597,14 @@ def _handle_addition( if not given_predicate_claims: LOGGER.debug('%s has no %s claim', subject_qid, predicate) _add( - subject_item, predicate, value, + subject_item, + predicate, + value, heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return @@ -605,12 +627,14 @@ def _handle_addition( '%s has no %s claim with value %s', subject_qid, predicate, value ) _add( - subject_item, predicate, value, + subject_item, + predicate, + value, heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return @@ -621,12 +645,26 @@ def _handle_addition( if case_insensitive: for claim in given_predicate_claims: if claim.getTarget().lower() == value: - _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) + _reference( + claim, + heuristic, + catalog_qid, + catalog_pid, + catalog_id, + edit_summary=edit_summary, + ) return for claim in given_predicate_claims: if claim.getTarget() == value: - _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) + _reference( + claim, + heuristic, + catalog_qid, + catalog_pid, + catalog_id, + edit_summary=edit_summary, + ) def _handle_redirect_and_dead(qid): @@ -662,11 +700,14 @@ def _essential_checks( if not data: LOGGER.warning('%s has no data at all', subject) _add( - item, predicate, value, heuristic, + item, + predicate, + value, + heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return None, None @@ -675,11 +716,14 @@ def _essential_checks( if not claims: LOGGER.warning('%s has no claims', subject) _add( - item, predicate, 
value, heuristic, + item, + predicate, + value, + heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return None, None @@ -707,11 +751,12 @@ def _check_for_same_value( value, ) _reference( - claim, heuristic, + claim, + heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return True return False @@ -762,15 +807,26 @@ def _add( claim.setTarget(value) subject_item.addClaim(claim, summary=edit_summary) LOGGER.debug('Added claim: %s', claim.toJSON()) - _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) + _reference( + claim, + heuristic, + catalog_qid, + catalog_pid, + catalog_id, + edit_summary=edit_summary, + ) LOGGER.info( 'Added (%s, %s, %s) statement', subject_item.getID(), predicate, value ) def _reference( - claim: pywikibot.Claim, heuristic: str, - catalog_qid=None, catalog_pid=None, catalog_id=None, edit_summary=None + claim: pywikibot.Claim, + heuristic: str, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): reference_node, log_buffer = [], [] @@ -783,9 +839,7 @@ def _reference( based_on_heuristic_reference = pywikibot.Claim( REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True ) - based_on_heuristic_reference.setTarget( - pywikibot.ItemPage(REPO, heuristic) - ) + based_on_heuristic_reference.setTarget(pywikibot.ItemPage(REPO, heuristic)) reference_node.append(based_on_heuristic_reference) log_buffer.append(f'({based_on_heuristic_reference.getID()}, {heuristic})') @@ -801,7 +855,9 @@ def _reference( if catalog_pid is not None and catalog_id is not None: # (catalog property, catalog ID) reference claim - catalog_id_reference = pywikibot.Claim(REPO, catalog_pid, is_reference=True) + catalog_id_reference = pywikibot.Claim( + REPO, catalog_pid, is_reference=True + ) catalog_id_reference.setTarget(catalog_id) reference_node.append(catalog_id_reference) log_buffer.append(f'({catalog_pid}, {catalog_id})') @@ -821,9 +877,7 @@ def _reference( claim.addSources(reference_node, summary=edit_summary) LOGGER.info('Added %s reference node', log_msg) except (APIError, Error,) as error: - LOGGER.warning( - 'Could not add %s reference node: %s', log_msg, error - ) + LOGGER.warning('Could not add %s reference node: %s', log_msg, error) def _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) -> None: diff --git a/soweego/linker/baseline.py b/soweego/linker/baseline.py index 3ae0ab44..c67bd25f 100644 --- a/soweego/linker/baseline.py +++ b/soweego/linker/baseline.py @@ -272,9 +272,7 @@ def _handle_result( to_upload.add(statement) if upload: - wikidata_bot.add_people_statements( - catalog, to_upload, 'links', sandbox - ) + wikidata_bot.add_people_statements(catalog, to_upload, 'links', sandbox) LOGGER.info('%s %s dumped to %s', catalog, origin, path_out) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index ff967438..7ba5167d 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -20,10 +20,16 @@ import click from sqlalchemy.exc import SQLAlchemyError -from soweego.commons import constants, data_gathering, keys, target_database, text_utils +from soweego.commons import ( + constants, + data_gathering, + keys, + target_database, + text_utils, +) from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot -from soweego.wikidata import vocabulary, api_requests +from 
soweego.wikidata import api_requests, vocabulary from soweego.wikidata.api_requests import get_url_blacklist LOGGER = logging.getLogger(__name__) @@ -31,7 +37,9 @@ # File name templates # For all CLIs WD_CACHE_FNAME = '{catalog}_{entity}_{criterion}_wd_cache.pkl' -IDS_TO_BE_DEPRECATED_FNAME = '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' +IDS_TO_BE_DEPRECATED_FNAME = ( + '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' +) SHARED_STATEMENTS_FNAME = '{catalog}_{entity}_{criterion}_shared_statements.csv' # For `dead_ids_cli` DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' @@ -39,7 +47,9 @@ EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv' URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv' # For `bio_cli` -BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' +BIO_STATEMENTS_TO_BE_ADDED_FNAME = ( + '{catalog}_{entity}_bio_statements_to_be_added.csv' +) @click.command() @@ -84,9 +94,10 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): dir_io, DEAD_IDS_FNAME.format(catalog=catalog, entity=entity) ) wd_cache_path = os.path.join( - dir_io, WD_CACHE_FNAME.format( + dir_io, + WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion='dead_ids' - ) + ), ) # Handle Wikidata cache @@ -190,34 +201,31 @@ def links_cli( criterion = 'links' # Output paths deprecate_path = os.path.join( - dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + dir_io, + IDS_TO_BE_DEPRECATED_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) add_ext_ids_path = os.path.join( - dir_io, EXT_IDS_FNAME.format( - catalog=catalog, entity=entity, task='added' - ) + dir_io, + EXT_IDS_FNAME.format(catalog=catalog, entity=entity, task='added'), ) add_urls_path = os.path.join( - dir_io, URLS_FNAME.format( - catalog=catalog, entity=entity, task='added' - ) + dir_io, URLS_FNAME.format(catalog=catalog, entity=entity, task='added') ) ref_ext_ids_path = os.path.join( - dir_io, EXT_IDS_FNAME.format( - catalog=catalog, entity=entity, task='referenced' - ) + dir_io, + EXT_IDS_FNAME.format(catalog=catalog, entity=entity, task='referenced'), ) ref_urls_path = os.path.join( - dir_io, URLS_FNAME.format( - catalog=catalog, entity=entity, task='referenced' - ) + dir_io, + URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'), ) wd_cache_path = os.path.join( - dir_io, WD_CACHE_FNAME.format( + dir_io, + WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) # Handle Wikidata cache @@ -230,9 +238,14 @@ def links_cli( catalog, entity, blacklist, wd_cache=wd_cache ) else: - deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links( - catalog, entity, blacklist - ) + ( + deprecate, + add_ext_ids, + add_urls, + ref_ext_ids, + ref_urls, + wd_cache, + ) = links(catalog, entity, blacklist) # Nothing to do: the catalog doesn't contain links if deprecate is None: @@ -240,9 +253,13 @@ def links_cli( # Dump output files _dump_deprecated(deprecate, deprecate_path) - _dump_csv_output(add_ext_ids, add_ext_ids_path, 'third-party IDs to be added') + _dump_csv_output( + add_ext_ids, add_ext_ids_path, 'third-party IDs to be added' + ) _dump_csv_output(add_urls, add_urls_path, 'URLs to be added') - _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced') + _dump_csv_output( + ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced' + ) _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced') # Dump Wikidata cache 
@@ -263,7 +280,7 @@ def links_cli( if sandbox: LOGGER.info( 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + vocabulary.SANDBOX_2, ) LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( @@ -277,7 +294,9 @@ def links_cli( wikidata_bot.add_people_statements( catalog, add_urls, criterion, sandbox ) - LOGGER.info('Starting referencing of shared external IDs in Wikidata ...') + LOGGER.info( + 'Starting referencing of shared external IDs in Wikidata ...' + ) wikidata_bot.add_people_statements( catalog, add_ext_ids, criterion, sandbox ) @@ -333,24 +352,26 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): """ criterion = 'bio' deprecate_path = os.path.join( - dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + dir_io, + IDS_TO_BE_DEPRECATED_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) add_path = os.path.join( - dir_io, BIO_STATEMENTS_TO_BE_ADDED_FNAME.format( - catalog=catalog, entity=entity - ) + dir_io, + BIO_STATEMENTS_TO_BE_ADDED_FNAME.format(catalog=catalog, entity=entity), ) ref_path = os.path.join( - dir_io, SHARED_STATEMENTS_FNAME.format( + dir_io, + SHARED_STATEMENTS_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) wd_cache_path = os.path.join( - dir_io, WD_CACHE_FNAME.format( + dir_io, + WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) # Handle Wikidata cache @@ -380,7 +401,8 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # version should be the most efficient solution pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path + 'Biographical data gathered from Wikidata dumped to %s', + wd_cache_path, ) except MemoryError: LOGGER.warning('Could not pickle the Wikidata cache: memory error') @@ -391,16 +413,14 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): if sandbox: LOGGER.info( 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + vocabulary.SANDBOX_2, ) LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( 'deprecate', catalog, entity, deprecate, sandbox ) LOGGER.info('Starting addition of extra statements to Wikidata ...') - wikidata_bot.add_people_statements( - catalog, add, criterion, sandbox - ) + wikidata_bot.add_people_statements(catalog, add, criterion, sandbox) LOGGER.info('Starting referencing of shared statements in Wikidata ...') wikidata_bot.add_people_statements( catalog, reference, criterion, sandbox @@ -488,7 +508,10 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Union[Tuple[defaultdict, list, list, list, list, dict], Tuple[None, None, None, None, None, None]]: +) -> Union[ + Tuple[defaultdict, list, list, list, list, dict], + Tuple[None, None, None, None, None, None], +]: """Validate identifiers against available links. Also generate statements based on additional links @@ -533,7 +556,11 @@ def links( if target_links is None: return None, None, None, None, None, None - deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add, reference = ( + defaultdict(set), + defaultdict(set), + defaultdict(set), + ) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -575,10 +602,13 @@ def links( 'URL statements to be added: %d', 'Third-party IDs to be referenced: %d. 
' 'URL statements to be referenced: %d', - catalog, entity, + catalog, + entity, len(deprecate), - len(add_ext_ids), len(add_urls), - len(ref_ext_ids), len(ref_urls) + len(add_ext_ids), + len(add_urls), + len(ref_ext_ids), + len(ref_urls), ) return deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_links @@ -586,7 +616,9 @@ def links( def bio( catalog: str, entity: str, wd_cache=None -) -> Union[Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None]]: +) -> Union[ + Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None] +]: """Validate identifiers against available biographical data. Look for: @@ -631,7 +663,11 @@ def bio( if target_bio is None: return None, None, None, None - deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add, reference = ( + defaultdict(set), + defaultdict(set), + defaultdict(set), + ) # Wikidata side if wd_cache is None: @@ -647,13 +683,14 @@ def bio( wd_bio = wd_cache # Validation - _validate( - keys.BIODATA, - wd_bio, target_bio, - deprecate, add, reference - ) + _validate(keys.BIODATA, wd_bio, target_bio, deprecate, add, reference) - return deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), wd_bio + return ( + deprecate, + _bio_statements_generator(add), + _bio_statements_generator(reference), + wd_bio, + ) def _apply_url_blacklist(url_statements): @@ -665,9 +702,7 @@ def _apply_url_blacklist(url_statements): # Expected order of magnitude: n = 10^2; m = 10^5 for domain in blacklist: # 10^2 url_statements = list( # Slurp the filter or it won't work - filter( - lambda stmt: domain not in stmt[2], url_statements # 10^5 - ) + filter(lambda stmt: domain not in stmt[2], url_statements) # 10^5 ) LOGGER.info( @@ -775,10 +810,13 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): # No cast to `set` because `wd_data` triples hold sets themselves wd_other = list(filter(lambda x: len(x) == 3, wd_data)) # In `target_data` we look for relevant date PIDs - target_dates = set(filter( - lambda x: x[0] in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), - target_data - )) + target_dates = set( + filter( + lambda x: x[0] + in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), + target_data, + ) + ) target_other = target_data.difference(target_dates) shared_dates, extra_dates = _compare('dates', wd_dates, target_dates) shared_other, extra_other = _compare('other', wd_other, target_other) @@ -820,8 +858,14 @@ def _compare(what, wd, target): continue inputs = ( - shared, extra, wd_matches, target_matches, - i, wd_elem, j, t_elem + shared, + extra, + wd_matches, + target_matches, + i, + wd_elem, + j, + t_elem, ) if what == 'dates': _compare_dates(inputs) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index fb98dc8f..8b1a5ad8 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -51,7 +51,7 @@ def resolve_qid(term: str, language='en') -> Optional[str]: 'action': 'wbsearchentities', 'format': 'json', 'search': term, - 'language': language + 'language': language, } response_body = _make_request(params) @@ -70,8 +70,7 @@ def resolve_qid(term: str, language='en') -> Optional[str]: # No search results except IndexError: LOGGER.info( - "No QIDs found for search term '%s' (language: %s)", - term, language + "No QIDs found for search term '%s' (language: %s)", term, language ) return None
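
The hunks above are mostly mechanical reformatting around the refactor that threads a parametrized heuristic QID and an optional (catalog PID, catalog ID) pair into every reference node built by `_reference`. As a rough orientation aid only, here is a minimal sketch of the resulting reference-node shape in pywikibot terms; it reuses the vocabulary constants and pywikibot calls visible in the diffs, while `build_reference_node`, the placeholder QID/PID/ID values, and the edit summary are hypothetical and not part of the patch series.

# Minimal sketch (not part of the patches): the reference node that the
# refactored `_reference` helper attaches to a claim, with the P887 heuristic
# parametrized (Q11660 for linker output, Q1266546 for validator output) and
# an optional (catalog PID, catalog ID) pair. Placeholder values throughout.
from datetime import date

import pywikibot

from soweego.wikidata import vocabulary

SITE = pywikibot.Site('wikidata', 'wikidata')
REPO = SITE.data_repository()
_today = date.today()
RETRIEVED_TODAY = pywikibot.WbTime(
    site=REPO, year=_today.year, month=_today.month, day=_today.day
)


def build_reference_node(heuristic, catalog_qid, catalog_pid=None, catalog_id=None):
    """Build [(P887, heuristic), (P248, catalog_qid),
    (catalog_pid, catalog_id) when both are given, (P813, today)]."""
    node = []

    # (based on heuristic, heuristic): depends on the bot task
    based_on_heuristic = pywikibot.Claim(
        REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True
    )
    based_on_heuristic.setTarget(pywikibot.ItemPage(REPO, heuristic))
    node.append(based_on_heuristic)

    # (stated in, catalog)
    stated_in = pywikibot.Claim(REPO, vocabulary.STATED_IN, is_reference=True)
    stated_in.setTarget(pywikibot.ItemPage(REPO, catalog_qid))
    node.append(stated_in)

    # (catalog ID property, catalog ID), only when both are known
    if catalog_pid is not None and catalog_id is not None:
        catalog_id_claim = pywikibot.Claim(REPO, catalog_pid, is_reference=True)
        catalog_id_claim.setTarget(catalog_id)
        node.append(catalog_id_claim)

    # (retrieved, today)
    retrieved = pywikibot.Claim(REPO, vocabulary.RETRIEVED, is_reference=True)
    retrieved.setTarget(RETRIEVED_TODAY)
    node.append(retrieved)

    return node


# Hypothetical usage on an existing `claim`, referencing a validator
# (record linkage) statement backed by a Discogs artist ID:
#
# claim.addSources(
#     build_reference_node(
#         vocabulary.RECORD_LINKAGE, 'Q504063', 'P1953', '264375'
#     ),
#     summary='illustrative edit summary only',
# )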