From 31fab3ae1718a8dcd5ff328d434752428ed5137c Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Mon, 2 Aug 2021 15:43:30 +0000 Subject: [PATCH 01/22] update & refurbish CLI docstrings --- soweego/validator/checks.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 9a293e48..d0e96bdb 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -157,11 +157,14 @@ def links_cli( Dump 3 output files: - 1. target identifiers to be deprecated. Format: (JSON) {identifier: [list of QIDs]} + 1. catalog IDs to be deprecated. JSON format: + {catalog_ID: [list of QIDs]} - 2. third-party identifiers to be added. Format: (CSV) QID,identifier_PID,identifier + 2. third-party IDs to be added. CSV format: + QID,third-party_PID,third-party_ID,catalog_ID - 3. URLs to be added. Format: (CSV) QID,P973,URL + 3. URLs to be added. CSV format: + QID,P973,URL,catalog_ID You can pass the '-u' flag to upload the output to Wikidata. @@ -257,9 +260,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): Dump 2 output files: - 1. target identifiers to be deprecated. Format: (JSON) {identifier: [list of QIDs]} + 1. catalog IDs to be deprecated. JSON format: + {catalog_ID: [list of QIDs]} - 2. statements to be added. Format: (CSV) QID,metadata_PID,value + 2. statements to be added. CSV format: + QID,PID,value,catalog_ID You can pass the '-u' flag to upload the output to Wikidata. """ From 2d79ff2790e30aedcc72c031bcf781e335354733 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Wed, 4 Aug 2021 13:25:07 +0000 Subject: [PATCH 02/22] add record linkage item to reference non-machine learning bot edits --- soweego/wikidata/vocabulary.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/soweego/wikidata/vocabulary.py b/soweego/wikidata/vocabulary.py index d990dc1e..7219e5fc 100644 --- a/soweego/wikidata/vocabulary.py +++ b/soweego/wikidata/vocabulary.py @@ -23,11 +23,15 @@ INSTANCE_OF = 'P31' OCCUPATION = 'P106' -# References nodes terms +# References node terms # 'based on heuristic' was introduced upon community discussion # See https://github.com/Wikidata/soweego/issues/373 BASED_ON_HEURISTIC = 'P887' +# Main task: the linker uses machine learning ARTIFICIAL_INTELLIGENCE = 'Q11660' +# Validator tasks: no machine learning +# See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) +RECORD_LINKAGE = 'Q1266546' STATED_IN = 'P248' RETRIEVED = 'P813' From 89528268b2efd48a9348db0df161e304258ddf2d Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Wed, 4 Aug 2021 13:26:52 +0000 Subject: [PATCH 03/22] [WIP] start refactoring to parametrize the P887 heuristic --- soweego/ingester/wikidata_bot.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 42c00386..b96742e8 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -61,11 +61,13 @@ ### # Approved task 1: identifiers addition # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot -IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] with P887 reference, see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' +IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' +'with P887 reference, see 
[[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' # Approved task 2: URL-based validation, criterion 2 # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_2 -URL_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] with extra P887 and catalog ID reference' +URL_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' +'with extra P887 and catalog ID reference' # Approved task 3: works by people # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_3 @@ -198,7 +200,7 @@ def people_cli(catalog, statements, sandbox): claim (Joey Ramone, member of, Ramones) - reference (based on heuristic, artificial intelligence), + reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 264375), (retrieved, today) @@ -789,17 +791,20 @@ def _add( ) -def _reference(claim, catalog_qid, person_pid, person_tid, summary=None): +def _reference( + claim, catalog_qid, person_pid, person_tid, heuristic, + summary=None +): # Reference node # create `pywikibot.Claim` instances at runtime: # pywikibot would cry if the same instances get uploaded multiple times # over the same item - # (based on heuristic, artificial intelligence) reference claim + # (based on heuristic, `heuristic`) reference claim: depends on the task based_on_heuristic_reference = pywikibot.Claim( REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True ) based_on_heuristic_reference.setTarget( - pywikibot.ItemPage(REPO, vocabulary.ARTIFICIAL_INTELLIGENCE) + pywikibot.ItemPage(REPO, heuristic) ) # (stated in, CATALOG) reference claim stated_in_reference = pywikibot.Claim( @@ -814,7 +819,7 @@ def _reference(claim, catalog_qid, person_pid, person_tid, summary=None): if None in (person_pid, person_tid,): reference_log = ( - f'({based_on_heuristic_reference.getID()}, {vocabulary.ARTIFICIAL_INTELLIGENCE}), ' + f'({based_on_heuristic_reference.getID()}, {heuristic}), ' f'({stated_in_reference.getID()}, {catalog_qid}), ' f'({retrieved_reference.getID()}, {TODAY})' ) @@ -840,7 +845,7 @@ def _reference(claim, catalog_qid, person_pid, person_tid, summary=None): tid_reference.setTarget(person_tid) reference_log = ( - f'({based_on_heuristic_reference.getID()}, {vocabulary.ARTIFICIAL_INTELLIGENCE}), ' + f'({based_on_heuristic_reference.getID()}, {heuristic}), ' f'({stated_in_reference.getID()}, {catalog_qid}), ' f'({person_pid}, {person_tid}), ' f'({retrieved_reference.getID()}, {TODAY})' From 17d62362506cc4bad9c000424431241d5fd7e7c3 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Wed, 4 Aug 2021 13:31:51 +0000 Subject: [PATCH 04/22] close #406 ; fix WD cache loading bug; don't remove dates from the original data; ensure complete comparison of dates --- soweego/validator/checks.py | 88 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index d0e96bdb..ae0a4813 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -550,8 +550,8 @@ def _apply_url_blacklist(url_statements): # O(nm) complexity: n = len(blacklist); m = len(url_statements) # Expected order of magnitude: n = 10^2; m = 10^5 for domain in blacklist: # 10^2 - url_statements = list( - filter( # Slurp the filter or it won't work + url_statements = list( # Slurp the filter or it won't work + filter( lambda stmt: domain not in stmt[2], url_statements # 10^5 ) ) @@ -564,9 +564,9 @@ def 
_apply_url_blacklist(url_statements): def _bio_to_be_added_generator(to_be_added): - for qid, values in to_be_added.items(): + for (qid, tid,), values in to_be_added.items(): for pid, value in values: - yield qid, pid, value + yield (qid, pid, value, tid,) def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): @@ -650,11 +650,11 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): def _compute_shared_and_extra(criterion, wd_data, target_data): # Properly compare dates when checking biographical data if criterion == keys.BIODATA: - wd_dates = _extract_dates(wd_data) - target_dates = _extract_dates(target_data) + wd_dates, wd_other = _extract_dates(wd_data) + target_dates, target_other = _extract_dates(target_data) shared_dates, extra_dates = _compare_dates(wd_dates, target_dates) - shared = wd_data.intersection(target_data).union(shared_dates) - extra = target_data.difference(wd_data).union(extra_dates) + shared = wd_other.intersection(target_other).union(shared_dates) + extra = target_other.difference(wd_other).union(extra_dates) else: shared = wd_data.intersection(target_data) extra = target_data.difference(wd_data) @@ -667,49 +667,50 @@ def _extract_dates(data): for pid, value in data: if pid in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH): dates.add((pid, value)) - # Remove dates from input set - data.difference_update(dates) - return dates + # Separate dates from other data + return dates, data.difference(dates) def _compare_dates(wd, target): + # Ensure unique comparisons, regardless of different precisions. + # For instance: + # `wd` has '1986-01-01/9' and '1986-11-29/11' + # `target` has '1986-01-01/9' + # `shared_dates` will have one element shared_dates, extra_dates = set(), set() - for wd_elem, t_elem in zip_longest(wd, target): - # Skip pair with None elements - if None in (wd_elem, t_elem): - continue - - wd_pid, wd_val = wd_elem - t_pid, t_val = t_elem + for wd_elem in wd: + for t_elem in target: + wd_pid, wd_val = wd_elem + t_pid, t_val = t_elem - # Don't compare birth with death dates - if wd_pid != t_pid: - continue + # Don't compare birth with death dates + if wd_pid != t_pid: + continue - # Skip unexpected None values - if None in (wd_val, t_val): - LOGGER.warning( - 'Skipping unexpected %s date pair with missing value(s)', - (wd_elem, t_elem), - ) - continue + # Skip unexpected `None` values + if None in (wd_val, t_val): + LOGGER.warning( + 'Skipping unexpected %s date pair with missing value(s)', + (wd_elem, t_elem), + ) + continue - wd_timestamp, wd_precision = wd_val.split('/') - t_timestamp, t_precision = t_val.split('/') + wd_timestamp, wd_precision = wd_val.split('/') + t_timestamp, t_precision = t_val.split('/') - shared_date, extra_date = _match_dates_by_precision( - min(int(wd_precision), int(t_precision)), - wd_elem, - wd_timestamp, - t_elem, - t_timestamp, - ) + shared_date, extra_date = _match_dates_by_precision( + min(int(wd_precision), int(t_precision)), + wd_elem, + wd_timestamp, + t_elem, + t_timestamp, + ) - if shared_date is not None: - shared_dates.add(shared_date) - if extra_date is not None: - extra_dates.add(extra_date) + if shared_date is not None: + shared_dates.add(shared_date) + if extra_date is not None: + extra_dates.add(extra_date) return shared_dates, extra_dates @@ -740,6 +741,7 @@ def _match_dates_by_precision( (wd_timestamp, t_timestamp), (wd_simplified, t_simplified), ) + # WD data has the priority shared = wd_elem else: LOGGER.debug('Target has an extra date: %s', t_timestamp) 
@@ -791,12 +793,11 @@ def _dump_csv_output(data, outpath, log_msg_subject): def _load_wd_cache(file_handle): raw_cache = json.load(file_handle) - LOGGER.info("Loaded Wikidata cache from '%s'", file_handle.name) cache = {} for qid, data in raw_cache.items(): for data_type, value_list in data.items(): # Biodata has values that are a list - if isinstance(value_list[0], list): + if value_list and isinstance(value_list[0], list): value_set = set() for value in value_list: if isinstance(value[1], list): @@ -814,6 +815,7 @@ def _load_wd_cache(file_handle): cache[qid][data_type] = set(value_list) else: cache[qid] = {data_type: set(value_list)} + LOGGER.info("Loaded Wikidata cache from '%s'", file_handle.name) return cache From b35e1ecae1fc76fc3eff74a318c86ea607a892a1 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Thu, 5 Aug 2021 16:12:03 +0200 Subject: [PATCH 05/22] [WIP] huge refactoring --- soweego/ingester/wikidata_bot.py | 557 ++++++++++++++----------------- 1 file changed, 253 insertions(+), 304 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index b96742e8..3563251c 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -2,17 +2,17 @@ # -*- coding: utf-8 -*- """A `Wikidata bot `_ that adds, deletes, or deprecates referenced statements. -Here are typical output examples. +Here are typical output examples: :func:`add_identifiers` | *Claim:* `Joey Ramone `_, `Discogs artist ID `_, `264375 `_ - | *Reference:* `stated in `_, `Discogs `_), (`retrieved `_, TIMESTAMP + | *Reference:* (`based on heuristic `_, `artificial intelligence `_), (`retrieved `_, TIMESTAMP) :func:`add_people_statements` | *Claim:* `Joey Ramone `_, `member of `_, `Ramones `_ - | *Reference:* `stated in `_, `Discogs `_), (`retrieved `_, TIMESTAMP + | *Reference:* (`based on heuristic `_, `record linkage `_),`(stated in `_, `Discogs `_), (`Discogs artist ID `_, `264375 `_), (`retrieved `_, TIMESTAMP) :func:`add_works_statements` | *Claim:* `Leave Home `_, `performer `_, `Ramones `_ - | *Reference:* `stated in `_, `Discogs `_), (`Discogs artist ID `_, `264375 `_), (`retrieved `_, TIMESTAMP + | *Reference:* (`based on heuristic `_, `record linkage `_),`(stated in `_, `Discogs `_), (`Discogs artist ID `_, `264375 `_), (`retrieved `_, TIMESTAMP) :func:`delete_or_deprecate_identifiers` deletes or deprecates identifier statements. 
@@ -20,9 +20,9 @@ __author__ = 'Marco Fossati' __email__ = 'fossati@spaziodati.eu' -__version__ = '1.0' +__version__ = '2.0' __license__ = 'GPL-3.0' -__copyright__ = 'Copyleft 2018, Hjfocs' +__copyright__ = 'Copyleft 2021, Hjfocs' import csv import json @@ -46,27 +46,17 @@ SITE = pywikibot.Site('wikidata', 'wikidata') REPO = SITE.data_repository() -# Time stamp object for the (retrieved, TIMESTAMP) reference -TODAY = date.today() -TIMESTAMP = pywikibot.WbTime( - site=REPO, - year=TODAY.year, - month=TODAY.month, - day=TODAY.day, - precision='day', -) - -### +####################### # BEGIN: Edit summaries -### +####################### # Approved task 1: identifiers addition # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' 'with P887 reference, see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' -# Approved task 2: URL-based validation, criterion 2 +# Approved task 2: URLs validation, criterion 2 # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_2 -URL_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' +LINKS_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' 'with extra P887 and catalog ID reference' # Approved task 3: works by people @@ -74,9 +64,23 @@ WORKS_SUMMARY = ( '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_3|bot task 3]]' ) -### + +# Biographical data validation, criterion 3 +# TODO add wikilink once the bot task gets approved +BIO_VALIDATION_SUMMARY = 'bot task 4' +##################### # END: Edit summaries -### +##################### + +# Time stamp object for the (retrieved, TIMESTAMP) reference +TODAY = date.today() +TIMESTAMP = pywikibot.WbTime( + site=REPO, + year=TODAY.year, + month=TODAY.month, + day=TODAY.day, + precision='day', +) # We also support Twitter SUPPORTED_TARGETS = target_database.supported_targets() ^ {TWITTER} @@ -176,13 +180,20 @@ def identifiers_cli(catalog, entity, identifiers, sandbox): @click.command() @click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS)) @click.argument('statements', type=click.File()) +@click.option( + '-c', + '--criterion', + type=click.Choice(('links', 'bio')), + help='Validation criterion used to generate STATEMENTS. ' + 'Same as the command passed to `python -m soweego sync`' +) @click.option( '-s', '--sandbox', is_flag=True, help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) -def people_cli(catalog, statements, sandbox): +def people_cli(catalog, statements, criterion, sandbox): """Add statements to Wikidata people. STATEMENTS must be a CSV file. 
@@ -205,37 +216,35 @@ def people_cli(catalog, statements, sandbox): (Discogs artist ID, 264375), (retrieved, today) """ + sandbox_item = vocabulary.SANDBOX_2 + # See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) + heuristic = vocabulary.RECORD_LINKAGE catalog_qid = target_database.get_catalog_qid(catalog) - person_pid = target_database.get_person_pid(catalog) + catalog_pid = target_database.get_person_pid(catalog) + + if criterion == 'links': + edit_summary = LINKS_VALIDATION_SUMMARY + elif criterion == 'bio': + edit_summary = BIO_VALIDATION_SUMMARY + else: + edit_summary = None if sandbox: LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 + 'Running on the Wikidata sandbox item %s ...', sandbox_item ) stmt_reader = csv.reader(statements) - for statement in stmt_reader: - person, predicate, value, person_tid = statement - if sandbox: - _add_or_reference( - vocabulary.SANDBOX_2, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) - else: - _add_or_reference( - person, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) + for person, predicate, value, catalog_id in stmt_reader: + subject = person if not sandbox else sandbox_item + _add_or_reference( + (subject, predicate, value), + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary + ) @click.command() @@ -245,7 +254,7 @@ def people_cli(catalog, statements, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def works_cli(catalog, statements, sandbox): """Add statements to Wikidata works. 
@@ -265,37 +274,26 @@ def works_cli(catalog, statements, sandbox): claim (C'mon Everybody, performer, Eddie Cochran) - reference (based on heuristic, artificial intelligence), + reference (based on heuristic, record linkage), (Discogs artist ID, 139984), (retrieved, today) """ + sandbox_item = vocabulary.SANDBOX_2 + catalog_qid = target_database.get_catalog_qid(catalog) is_imdb, person_pid = _get_works_args(catalog) + heuristic = vocabulary.RECORD_LINKAGE if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) stmt_reader = csv.reader(statements) - for statement in stmt_reader: - work, predicate, person, person_tid = statement - if sandbox: - _add_or_reference_works( - vocabulary.SANDBOX_1, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) - else: - _add_or_reference_works( - work, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) + for work, predicate, person, person_id in stmt_reader: + subject = work if not sandbox else sandbox_item + _add_or_reference_works( + (subject, predicate, person), + heuristic, + catalog_qid, person_pid, person_id, + is_imdb=is_imdb, edit_summary=WORKS_SUMMARY + ) def add_identifiers( @@ -312,27 +310,23 @@ def add_identifiers( :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + sandbox_item = vocabulary.SANDBOX_2 catalog_pid = target_database.get_catalog_pid(catalog, entity) + heuristic = vocabulary.ARTIFICIAL_INTELLIGENCE + + if sandbox: + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) + for qid, tid in identifiers.items(): LOGGER.info('Processing %s match: %s -> %s', catalog, qid, tid) - if sandbox: - LOGGER.debug( - 'Using Wikidata sandbox item %s as subject, instead of %s', - vocabulary.SANDBOX_1, - qid, - ) - _add_or_reference( - vocabulary.SANDBOX_1, - catalog_pid, - tid, - summary=IDENTIFIERS_SUMMARY, - ) - else: - _add_or_reference( - qid, catalog_pid, tid, summary=IDENTIFIERS_SUMMARY - ) + subject = qid if not sandbox else sandbox_item + _add_or_reference( + (subject, catalog_pid, tid,), + heuristic, + edit_summary=IDENTIFIERS_SUMMARY) +# TODO handle edit summary def add_people_statements( catalog: str, statements: Iterable, sandbox: bool ) -> None: @@ -343,39 +337,34 @@ def add_people_statements( :func:`soweego.validator.checks.bio`. :param statements: iterable of - (subject, predicate, value, target ID) tuples + (subject, predicate, value, catalog ID) tuples :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``. 
A supported catalog :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + sandbox_item = vocabulary.SANDBOX_2 catalog_qid = target_database.get_catalog_qid(catalog) person_pid = target_database.get_person_pid(catalog) + heuristic = vocabulary.RECORD_LINKAGE + + if sandbox: + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) - for subject, predicate, value, person_tid in statements: + for subject, predicate, value, catalog_id in statements: LOGGER.info( - 'Processing (%s, %s, %s) statement', subject, predicate, value + 'Processing (%s, %s, %s, %s) statement ...', + subject, predicate, value, catalog_id + ) + actual_subject = subject if not sandbox else sandbox_item + _add_or_reference( + (actual_subject, predicate, value), + heuristic, + catalog_qid=catalog_qid, + catalog_pid=person_pid, + catalog_id=catalog_id, + edit_summary=LINKS_VALIDATION_SUMMARY ) - if sandbox: - _add_or_reference( - vocabulary.SANDBOX_2, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) - else: - _add_or_reference( - subject, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=URL_VALIDATION_SUMMARY, - ) def add_works_statements( @@ -393,32 +382,29 @@ def add_works_statements( :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + sandbox_item = vocabulary.SANDBOX_2 + catalog_qid = target_database.get_catalog_qid(catalog) is_imdb, person_pid = _get_works_args(catalog) + heuristic = vocabulary.RECORD_LINKAGE + + if sandbox: + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) - for work, predicate, person, person_tid in statements: + for work, predicate, person, person_id in statements: LOGGER.info( - 'Processing (%s, %s, %s) statement', work, predicate, person + 'Processing (%s, %s, %s, %s) statement', + work, predicate, person, person_id + ) + subject = work if not sandbox else sandbox_item + _add_or_reference_works( + (subject, predicate, person), + heuristic, + catalog_qid, + person_pid, + person_id, + is_imdb=is_imdb, + edit_summary=WORKS_SUMMARY ) - if sandbox: - _add_or_reference_works( - vocabulary.SANDBOX_1, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) - else: - _add_or_reference_works( - work, - predicate, - person, - person_pid, - person_tid, - is_imdb=is_imdb, - summary=WORKS_SUMMARY, - ) def delete_or_deprecate_identifiers( @@ -459,35 +445,26 @@ def delete_or_deprecate_identifiers( _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) -def _add_or_reference_works( - work: str, - predicate: str, - person: str, - person_pid: str, - person_tid: str, - is_imdb=False, - summary=None, -) -> None: +def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_pid: str, catalog_id: str, + is_imdb=False, edit_summary=None) -> None: + work, predicate, person = statement # Parse value into an item in case of QID qid = match(QID_REGEX, person) if not qid: LOGGER.warning( - "%s doesn't look like a QID, won't try to add the (%s, %s, %s) statement", - person, - work, - predicate, - person, + "%s doesn't look like a QID, won't try to add the %s statement", + person, statement ) return - person = pywikibot.ItemPage(REPO, qid.group()) + person_item = pywikibot.ItemPage(REPO, qid.group()) subject_item, claims = _essential_checks( - work, - predicate, - person, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + (work, predicate, person_item), + 
heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary, ) if None in (subject_item, claims): return @@ -497,12 +474,12 @@ def _add_or_reference_works( for pred in vocabulary.MOVIE_PIDS: if _check_for_same_value( claims, - work, - pred, - person, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + (work, pred, person_item), + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ): return @@ -510,30 +487,24 @@ def _add_or_reference_works( claims, subject_item, predicate, - person, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + person_item, + heuristic, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary, ) def _add_or_reference( - subject: str, - predicate: str, - value: str, - catalog_qid: str, - person_pid: str, - person_tid: str, - summary=None, + statement, heuristic, + catalog_qid=None, catalog_pid=None, catalog_id=None, + edit_summary=None ) -> None: + subject, predicate, value = statement subject_item, claims = _essential_checks( - subject, - predicate, - value, - catalog_qid, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + statement, heuristic, + catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, + edit_summary=edit_summary ) if None in (subject_item, claims): @@ -545,11 +516,12 @@ def _add_or_reference( # See https://www.wikidata.org/wiki/User_talk:Jura1#Thanks_for_your_feedback_on_User:Soweego_bot_task_2 if _check_for_same_value( claims, - subject, - vocabulary.OFFICIAL_WEBSITE, - value, - catalog_qid, - summary=summary, + (subject, vocabulary.OFFICIAL_WEBSITE, value,), + heuristic, + edit_summary=edit_summary, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id ): return @@ -565,11 +537,12 @@ def _add_or_reference( subject_item, predicate, value, - catalog_qid, + heuristic, case_insensitive=case_insensitive, - person_pid=person_pid, - person_tid=person_tid, - summary=summary, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) @@ -578,11 +551,12 @@ def _handle_addition( subject_item, predicate, value, - catalog_qid, + heuristic, case_insensitive=False, - person_pid=None, - person_tid=None, - summary=None, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): given_predicate_claims = claims.get(predicate) subject_qid = subject_item.getID() @@ -591,13 +565,12 @@ def _handle_addition( if not given_predicate_claims: LOGGER.debug('%s has no %s claim', subject_qid, predicate) _add( - subject_item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + subject_item, predicate, value, + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return @@ -620,13 +593,12 @@ def _handle_addition( '%s has no %s claim with value %s', subject_qid, predicate, value ) _add( - subject_item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + subject_item, predicate, value, + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return @@ -637,16 +609,12 @@ def _handle_addition( if case_insensitive: for claim in given_predicate_claims: if claim.getTarget().lower() == value: - _reference( - claim, catalog_qid, person_pid, person_tid, summary=summary - ) + 
_reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) return for claim in given_predicate_claims: if claim.getTarget() == value: - _reference( - claim, catalog_qid, person_pid, person_tid, summary=summary - ) + _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) def _handle_redirect_and_dead(qid): @@ -665,14 +633,14 @@ def _handle_redirect_and_dead(qid): def _essential_checks( - subject, - predicate, - value, - catalog_qid, - person_pid=None, - person_tid=None, - summary=None, + statement: tuple, + heuristic: str, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): + subject, predicate, value = statement item, data = _handle_redirect_and_dead(subject) if item is None and data is None: @@ -682,13 +650,11 @@ def _essential_checks( if not data: LOGGER.warning('%s has no data at all', subject) _add( - item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + item, predicate, value, heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return None, None @@ -697,13 +663,11 @@ def _essential_checks( if not claims: LOGGER.warning('%s has no claims', subject) _add( - item, - predicate, - value, - catalog_qid, - person_pid, - person_tid, - summary=summary, + item, predicate, value, heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return None, None @@ -712,14 +676,14 @@ def _essential_checks( def _check_for_same_value( subject_claims, - subject, - predicate, - value, - catalog_qid, - person_pid=None, - person_tid=None, - summary=None, + statement, + heuristic, + edit_summary=None, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, ): + subject, predicate, value = statement given_predicate_claims = subject_claims.get(predicate) if given_predicate_claims: for claim in given_predicate_claims: @@ -731,7 +695,11 @@ def _check_for_same_value( value, ) _reference( - claim, catalog_qid, person_pid, person_tid, summary=summary + claim, heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary ) return True return False @@ -776,98 +744,79 @@ def _add( subject_item, predicate, value, - catalog_qid, - person_pid, - person_tid, - summary=None, + heuristic, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): claim = pywikibot.Claim(REPO, predicate) claim.setTarget(value) - subject_item.addClaim(claim, summary=summary) + subject_item.addClaim(claim, summary=edit_summary) LOGGER.debug('Added claim: %s', claim.toJSON()) - _reference(claim, catalog_qid, person_pid, person_tid, summary=summary) + _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) LOGGER.info( 'Added (%s, %s, %s) statement', subject_item.getID(), predicate, value ) def _reference( - claim, catalog_qid, person_pid, person_tid, heuristic, - summary=None + claim: pywikibot.Claim, heuristic: str, + catalog_qid=None, catalog_pid=None, catalog_id=None, edit_summary=None ): - # Reference node - # create `pywikibot.Claim` instances at runtime: + reference_node, log_buffer = [], [] + + # Create `pywikibot.Claim` instances at runtime: # pywikibot would cry if the same instances get uploaded multiple times # over the same item - # (based on heuristic, `heuristic`) reference claim: depends on the task + + # Depends on the bot task + # (based on 
heuristic, `heuristic`) reference claim based_on_heuristic_reference = pywikibot.Claim( REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True ) based_on_heuristic_reference.setTarget( pywikibot.ItemPage(REPO, heuristic) ) - # (stated in, CATALOG) reference claim - stated_in_reference = pywikibot.Claim( - REPO, vocabulary.STATED_IN, is_reference=True - ) - stated_in_reference.setTarget(pywikibot.ItemPage(REPO, catalog_qid)) + reference_node.append(based_on_heuristic_reference) + log_buffer.append(f'({based_on_heuristic_reference.getID()}, {heuristic})') + + # Validator tasks only + if catalog_qid is not None: + # (stated in, CATALOG) reference claim + stated_in_reference = pywikibot.Claim( + REPO, vocabulary.STATED_IN, is_reference=True + ) + stated_in_reference.setTarget(pywikibot.ItemPage(REPO, catalog_qid)) + reference_node.append(stated_in_reference) + log_buffer.append(f'({stated_in_reference.getID()}, {catalog_qid})') + + if catalog_pid is not None and catalog_id is not None: + # (catalog property, catalog ID) reference claim + catalog_id_reference = pywikibot.Claim(REPO, catalog_pid, is_reference=True) + catalog_id_reference.setTarget(catalog_id) + reference_node.append(catalog_id_reference) + log_buffer.append(f'({catalog_pid}, {catalog_id})') + + # All tasks # (retrieved, TODAY) reference claim retrieved_reference = pywikibot.Claim( REPO, vocabulary.RETRIEVED, is_reference=True ) retrieved_reference.setTarget(TIMESTAMP) + reference_node.append(retrieved_reference) + log_buffer.append(f'({retrieved_reference.getID()}, {TODAY})') - if None in (person_pid, person_tid,): - reference_log = ( - f'({based_on_heuristic_reference.getID()}, {heuristic}), ' - f'({stated_in_reference.getID()}, {catalog_qid}), ' - f'({retrieved_reference.getID()}, {TODAY})' - ) - - try: - claim.addSources( - [ - based_on_heuristic_reference, - stated_in_reference, - retrieved_reference, - ], - summary=summary, - ) + log_msg = ', '.join(log_buffer) - LOGGER.info('Added %s reference node', reference_log) - except (APIError, Error,) as error: - LOGGER.warning( - 'Could not add %s reference node: %s', reference_log, error - ) - else: - # (catalog property, catalog_ID) reference claim - tid_reference = pywikibot.Claim(REPO, person_pid, is_reference=True) - tid_reference.setTarget(person_tid) - - reference_log = ( - f'({based_on_heuristic_reference.getID()}, {heuristic}), ' - f'({stated_in_reference.getID()}, {catalog_qid}), ' - f'({person_pid}, {person_tid}), ' - f'({retrieved_reference.getID()}, {TODAY})' + try: + claim.addSources(reference_node, summary=edit_summary) + LOGGER.info('Added %s reference node', log_msg) + except (APIError, Error,) as error: + LOGGER.warning( + 'Could not add %s reference node: %s', log_msg, error ) - try: - claim.addSources( - [ - based_on_heuristic_reference, - stated_in_reference, - tid_reference, - retrieved_reference, - ], - summary=summary, - ) - - LOGGER.info('Added %s reference node', reference_log) - except (APIError, Error,) as error: - LOGGER.warning( - 'Could not add %s reference node: %s', reference_log, error - ) - def _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) -> None: item, data = _handle_redirect_and_dead(qid) From 41d4d3042c7fa167ac0a4942e378a140ae99ca7b Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Fri, 6 Aug 2021 15:00:06 +0000 Subject: [PATCH 06/22] pass catalog QID to works; improve CLI help; use sandbox 2; simplify code --- soweego/ingester/wikidata_bot.py | 48 +++++++++++++++----------------- 1 file changed, 22 insertions(+), 26 
deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 3563251c..f8ac8c80 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -51,18 +51,24 @@ ####################### # Approved task 1: identifiers addition # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot -IDENTIFIERS_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' -'with P887 reference, see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' +IDENTIFIERS_SUMMARY = ( + '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot|bot task 1]] ' + 'with P887 reference, ' + 'see [[Topic:V6cc1thgo09otfw5#flow-post-v7i05rpdja1b3wzk|discussion]]' +) # Approved task 2: URLs validation, criterion 2 # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_2 -LINKS_VALIDATION_SUMMARY = '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' -'with extra P887 and catalog ID reference' +LINKS_VALIDATION_SUMMARY = ( + '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_2|bot task 2]] ' + 'with extra P887 and catalog ID reference' +) # Approved task 3: works by people # https://www.wikidata.org/wiki/Wikidata:Requests_for_permissions/Bot/Soweego_bot_3 WORKS_SUMMARY = ( - '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_3|bot task 3]]' + '[[Wikidata:Requests_for_permissions/Bot/Soweego_bot_3|bot task 3]] ' + 'with extra P887 reference' ) # Biographical data validation, criterion 3 @@ -122,7 +128,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help='Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): """Deprecate invalid identifiers. @@ -148,7 +154,7 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def identifiers_cli(catalog, entity, identifiers, sandbox): """Add identifiers. 
@@ -168,12 +174,8 @@ def identifiers_cli(catalog, entity, identifiers, sandbox): claim (Richard Hell, Discogs artist ID, 266995) - reference (based on heuristic, artificial intelligence), - (retrieved, today) + reference (based on heuristic, artificial intelligence), (retrieved, today) """ - if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') - add_identifiers(json.load(identifiers), catalog, entity, sandbox) @@ -211,10 +213,7 @@ def people_cli(catalog, statements, criterion, sandbox): claim (Joey Ramone, member of, Ramones) - reference (based on heuristic, record linkage), - (stated in, Discogs), - (Discogs artist ID, 264375), - (retrieved, today) + reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 264375), (retrieved, today) """ sandbox_item = vocabulary.SANDBOX_2 # See https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/07#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) @@ -274,8 +273,7 @@ def works_cli(catalog, statements, sandbox): claim (C'mon Everybody, performer, Eddie Cochran) - reference (based on heuristic, record linkage), - (Discogs artist ID, 139984), (retrieved, today) + reference (based on heuristic, record linkage), (stated in, Discogs), (Discogs artist ID, 139984), (retrieved, today) """ sandbox_item = vocabulary.SANDBOX_2 catalog_qid = target_database.get_catalog_qid(catalog) @@ -308,7 +306,7 @@ def add_identifiers( 'writer', 'audiovisual_work', 'musical_work'}``. A supported entity :param sandbox: whether to perform edits on the - `Wikidata sandbox `_ item + `Wikidata sandbox 2 `_ item """ sandbox_item = vocabulary.SANDBOX_2 catalog_pid = target_database.get_catalog_pid(catalog, entity) @@ -428,21 +426,18 @@ def delete_or_deprecate_identifiers( A supported entity :param invalid: a ``{invalid_catalog_identifier: [list of QIDs]}`` dictionary :param sandbox: whether to perform edits on the - `Wikidata sandbox `_ item + `Wikidata sandbox 2 `_ item """ + sandbox_item = vocabulary.SANDBOX_2 catalog_pid = target_database.get_catalog_pid(catalog, entity) for tid, qids in invalid.items(): for qid in qids: + actual_qid = qid if not sandbox else sandbox_item LOGGER.info( 'Will %s %s identifier: %s -> %s', action, catalog, tid, qid ) - if sandbox: - _delete_or_deprecate( - action, vocabulary.SANDBOX_1, tid, catalog, catalog_pid - ) - else: - _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) + _delete_or_deprecate(action, actual_qid, tid, catalog, catalog_pid) def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_pid: str, catalog_id: str, @@ -489,6 +484,7 @@ def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, predicate, person_item, heuristic, + catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, edit_summary=edit_summary, From cb814da21e2f1be831029f4324a32dbf97c3a70d Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Mon, 9 Aug 2021 16:21:23 +0200 Subject: [PATCH 07/22] handle edit summary in public function; use sandbox item 2 everywhere & update docstrings --- soweego/ingester/wikidata_bot.py | 34 +++++++++++++++++++++++--------- soweego/linker/baseline.py | 4 +++- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index f8ac8c80..882a9b87 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -102,7 +102,7 @@ '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata 
sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox 2 item {vocabulary.SANDBOX_2}.', ) def delete_cli(catalog, entity, invalid_identifiers, sandbox): """Delete invalid identifiers. @@ -111,7 +111,10 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): Format: { catalog_identifier: [ list of QIDs ] } """ if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) delete_or_deprecate_identifiers( 'delete', catalog, entity, json.load(invalid_identifiers), sandbox @@ -128,7 +131,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): """Deprecate invalid identifiers. @@ -137,7 +140,10 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): Format: { catalog_identifier: [ list of QIDs ] } """ if sandbox: - LOGGER.info('Running on the Wikidata sandbox item ...') + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) delete_or_deprecate_identifiers( 'deprecate', catalog, entity, json.load(invalid_identifiers), sandbox @@ -324,9 +330,8 @@ def add_identifiers( edit_summary=IDENTIFIERS_SUMMARY) -# TODO handle edit summary def add_people_statements( - catalog: str, statements: Iterable, sandbox: bool + catalog: str, statements: Iterable, criterion: str, sandbox: bool ) -> None: """Add statements to existing Wikidata people. @@ -334,13 +339,24 @@ def add_people_statements( as per :func:`soweego.validator.checks.links` and :func:`soweego.validator.checks.bio`. - :param statements: iterable of - (subject, predicate, value, catalog ID) tuples :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``. A supported catalog + :param statements: iterable of + (subject, predicate, value, catalog ID) tuples + :param criterion: ``{'links', 'bio'}``. A supported validation criterion :param sandbox: whether to perform edits on the `Wikidata sandbox `_ item """ + if criterion == 'links': + edit_summary = LINKS_VALIDATION_SUMMARY + elif criterion == 'bio': + edit_summary = BIO_VALIDATION_SUMMARY + else: + raise ValueError( + f"Invalid criterion: '{criterion}'. 
" + "Please use either 'links' or 'bio'" + ) + sandbox_item = vocabulary.SANDBOX_2 catalog_qid = target_database.get_catalog_qid(catalog) person_pid = target_database.get_person_pid(catalog) @@ -361,7 +377,7 @@ def add_people_statements( catalog_qid=catalog_qid, catalog_pid=person_pid, catalog_id=catalog_id, - edit_summary=LINKS_VALIDATION_SUMMARY + edit_summary=edit_summary ) diff --git a/soweego/linker/baseline.py b/soweego/linker/baseline.py index f1ced1fb..3ae0ab44 100644 --- a/soweego/linker/baseline.py +++ b/soweego/linker/baseline.py @@ -272,7 +272,9 @@ def _handle_result( to_upload.add(statement) if upload: - wikidata_bot.add_people_statements(to_upload, sandbox) + wikidata_bot.add_people_statements( + catalog, to_upload, 'links', sandbox + ) LOGGER.info('%s %s dumped to %s', catalog, origin, path_out) From 46b8080d0b978e008ad931acb80e1b43e3e6f4ed Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Mon, 9 Aug 2021 16:23:36 +0200 Subject: [PATCH 08/22] simpler although a bit more redundant WD upload code; update docstrings to use sandbox item 2 --- soweego/validator/checks.py | 76 ++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index ae0a4813..0206c17e 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -5,17 +5,16 @@ __author__ = 'Marco Fossati' __email__ = 'fossati@spaziodati.eu' -__version__ = '1.0' +__version__ = '2.0' __license__ = 'GPL-3.0' -__copyright__ = 'Copyleft 2018, Hjfocs' +__copyright__ = 'Copyleft 2021, Hjfocs' import csv import json import logging import os from collections import defaultdict -from itertools import zip_longest -from typing import DefaultDict, Dict, Iterator, List, Tuple +from typing import DefaultDict, Dict, Iterator, Tuple, Union import click from sqlalchemy.exc import SQLAlchemyError @@ -59,7 +58,7 @@ '-s', '--sandbox', is_flag=True, - help='Perform all deprecations on the Wikidata sandbox item Q4115189.', + help=f'Perform all deprecations on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) @click.option( '--dump-wikidata', @@ -114,7 +113,10 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): # Deprecate dead ids in Wikidata if deprecate: - _upload_result(catalog, entity, dead, None, None, sandbox) + LOGGER.info('Starting deprecation of %s IDs ...', catalog) + wikidata_bot.delete_or_deprecate_identifiers( + 'deprecate', catalog, entity, dead, sandbox + ) @click.command() @@ -137,7 +139,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) @click.option( '--dump-wikidata', @@ -189,13 +191,13 @@ def links_cli( with open(wd_links_path) as wdin: wd_links = _load_wd_cache(wdin) # Discard the last return value: Wikidata cache - ids_to_be_deprecated, ids_to_be_added, urls_to_be_added, _ = links( + ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, _ = links( catalog, entity, blacklist, wd_cache=wd_links ) else: ( ids_to_be_deprecated, - ids_to_be_added, + ext_ids_to_be_added, urls_to_be_added, wd_links, ) = links(catalog, entity, blacklist) @@ -211,18 +213,23 @@ def links_cli( # Dump output files _dump_deprecated(ids_to_be_deprecated, deprecated_path) - _dump_csv_output(ids_to_be_added, ids_path, 'third-party IDs') + _dump_csv_output(ext_ids_to_be_added, ids_path, 
'third-party IDs') _dump_csv_output(urls_to_be_added, urls_path, 'URLs') # Upload the output to Wikidata if upload: - _upload_result( - catalog, - entity, - ids_to_be_deprecated, - urls_to_be_added, - ids_to_be_added, - sandbox, + criterion = 'links' + LOGGER.info('Starting deprecation of %s IDs ...', catalog) + wikidata_bot.delete_or_deprecate_identifiers( + 'deprecate', catalog, entity, ids_to_be_deprecated, sandbox + ) + LOGGER.info('Starting addition of external IDs to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, ext_ids_to_be_added, criterion, sandbox + ) + LOGGER.info('Starting addition of statements to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, urls_to_be_added, criterion, sandbox ) @@ -240,7 +247,7 @@ def links_cli( '-s', '--sandbox', is_flag=True, - help='Perform all edits on the Wikidata sandbox item Q4115189.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) @click.option( '--dump-wikidata', @@ -302,7 +309,15 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # Upload the output to Wikidata if upload: - _upload(catalog, entity, to_be_deprecated, to_be_added, sandbox) + criterion = 'bio' + LOGGER.info('Starting deprecation of %s IDs ...', catalog) + wikidata_bot.delete_or_deprecate_identifiers( + 'deprecate', catalog, entity, to_be_deprecated, sandbox + ) + LOGGER.info('Starting addition of statements to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, to_be_added, criterion, sandbox + ) def dead_ids( @@ -386,7 +401,7 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Tuple[DefaultDict, List, List, Dict]: +) -> Union[Tuple[defaultdict, list, list, dict], Tuple[None, None, None, None]]: """Validate identifiers against available links. Also generate statements based on additional links @@ -477,7 +492,7 @@ def links( def bio( catalog: str, entity: str, wd_cache=None -) -> Tuple[DefaultDict, Iterator, Dict]: +) -> Union[Tuple[defaultdict, Iterator, dict], Tuple[None, None, None]]: """Validate identifiers against available biographical data. 
Look for: @@ -566,7 +581,7 @@ def _apply_url_blacklist(url_statements): def _bio_to_be_added_generator(to_be_added): for (qid, tid,), values in to_be_added.items(): for pid, value in values: - yield (qid, pid, value, tid,) + yield qid, pid, value, tid def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): @@ -751,23 +766,6 @@ def _match_dates_by_precision( return shared, extra -def _upload_result( - catalog, entity, to_deprecate, urls_to_add, ext_ids_to_add, sandbox -): - _upload(catalog, entity, to_deprecate, urls_to_add, sandbox) - LOGGER.info('Starting addition of external IDs to Wikidata ...') - wikidata_bot.add_people_statements(catalog, ext_ids_to_add, sandbox) - - -def _upload(catalog, entity, to_deprecate, to_add, sandbox): - LOGGER.info('Starting deprecation of %s IDs ...', catalog) - wikidata_bot.delete_or_deprecate_identifiers( - 'deprecate', catalog, entity, to_deprecate, sandbox - ) - LOGGER.info('Starting addition of statements to Wikidata ...') - wikidata_bot.add_people_statements(catalog, to_add, sandbox) - - def _dump_deprecated(data, outpath): if data: with open(outpath, 'w') as deprecated: From df82da1f603258ebcc542187b4cd04d6865e77e0 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Mon, 9 Aug 2021 15:33:53 +0000 Subject: [PATCH 09/22] remove 2 --- soweego/ingester/wikidata_bot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 882a9b87..5d2a6bc7 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -102,7 +102,7 @@ '-s', '--sandbox', is_flag=True, - help=f'Perform all edits on the Wikidata sandbox 2 item {vocabulary.SANDBOX_2}.', + help=f'Perform all edits on the Wikidata sandbox item {vocabulary.SANDBOX_2}.', ) def delete_cli(catalog, entity, invalid_identifiers, sandbox): """Delete invalid identifiers. 
From b37bad11ee6ff84e9a991862ad57b402b4c725e0 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Mon, 9 Aug 2021 15:34:26 +0000 Subject: [PATCH 10/22] log an info msg when uploading to sandbox --- soweego/validator/checks.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 0206c17e..314e6a50 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -218,6 +218,11 @@ def links_cli( # Upload the output to Wikidata if upload: + if sandbox: + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) criterion = 'links' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( @@ -309,6 +314,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # Upload the output to Wikidata if upload: + if sandbox: + LOGGER.info( + 'Running on the Wikidata sandbox item %s ...', + vocabulary.SANDBOX_2 + ) criterion = 'bio' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( From d3427d0a651051ea9d39a7b7b92594d0b1e87ee7 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 10 Aug 2021 10:42:48 +0000 Subject: [PATCH 11/22] keep track of matching dates to avoid incorrect comparisons, closes #412 ; return extra dates in 'ISO_date/precision' format --- soweego/validator/checks.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 314e6a50..46bdeb73 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -703,9 +703,14 @@ def _compare_dates(wd, target): # `target` has '1986-01-01/9' # `shared_dates` will have one element shared_dates, extra_dates = set(), set() + wd_matches, target_matches = [], [] + + for i, wd_elem in enumerate(wd): + for j, t_elem in enumerate(target): + # Don't compare when already matched + if i in wd_matches or j in target_matches: + continue - for wd_elem in wd: - for t_elem in target: wd_pid, wd_val = wd_elem t_pid, t_val = t_elem @@ -734,9 +739,14 @@ def _compare_dates(wd, target): if shared_date is not None: shared_dates.add(shared_date) - if extra_date is not None: + # Keep track of matches to avoid useless computation + # and incorrect comparisons: + # this happens when WD has multiple claims with + # the same property + wd_matches.append(i) + target_matches.append(j) + elif extra_date is not None: extra_dates.add(extra_date) - return shared_dates, extra_dates @@ -770,9 +780,7 @@ def _match_dates_by_precision( shared = wd_elem else: LOGGER.debug('Target has an extra date: %s', t_timestamp) - # Output dates in ISO format - # t_elem[0] is the PID - extra = (t_elem[0], t_timestamp) + extra = t_elem return shared, extra From 2a90b0fbc366950eee856870ce1d029d596e7f17 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 10 Aug 2021 10:44:17 +0000 Subject: [PATCH 12/22] expect date strings in 'ISO-date/precision' format: avoid 01-01 precision hack --- soweego/ingester/wikidata_bot.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 5d2a6bc7..30ad56f1 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -727,18 +727,14 @@ def _parse_value(value): return pywikibot.ItemPage(REPO, value_is_qid.group()) # Try to build a date try: - date_value = date.fromisoformat(value) - # Precision 
hack: it's a year if both month and day are 1 - precision = ( - vocabulary.YEAR - if date_value.month == 1 and date_value.day == 1 - else vocabulary.DAY - ) + # A date should be in the form '1984-11-16/11' + date_str, precision = value.split('/') + date_obj = date.fromisoformat(date_str) return pywikibot.WbTime( - date_value.year, - date_value.month, - date_value.day, - precision=precision, + date_obj.year, + date_obj.month, + date_obj.day, + precision=int(precision), ) # Otherwise return the value as is except ValueError: From 5fda9e8fc3fe5ebbe86c780e282fda7a434f6561 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 10 Aug 2021 16:38:23 +0000 Subject: [PATCH 13/22] [WIP] start work on issue #413 --- soweego/commons/data_gathering.py | 35 +++++++++++++++++-------------- soweego/validator/checks.py | 15 ++++++++++++- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/soweego/commons/data_gathering.py b/soweego/commons/data_gathering.py index c14709aa..86a150ee 100644 --- a/soweego/commons/data_gathering.py +++ b/soweego/commons/data_gathering.py @@ -20,7 +20,7 @@ from sqlalchemy import or_ from tqdm import tqdm -from soweego.commons import constants, keys, target_database, url_utils +from soweego.commons import constants, keys, target_database, text_utils, url_utils from soweego.commons.db_manager import DBManager from soweego.importer import models from soweego.wikidata import api_requests, sparql_queries, vocabulary @@ -382,27 +382,30 @@ def gather_wikidata_biodata(wikidata): for qid, pid, value in api_requests.get_biodata(wikidata.keys()): parsed = api_requests.parse_value(value) if not wikidata[qid].get(keys.BIODATA): - wikidata[qid][keys.BIODATA] = set() - # `parsed` is a set of labels if the value is a QID - # see api_requests.parse_value + wikidata[qid][keys.BIODATA] = [] + # If `parsed` is a set, we have item labels, + # see `api_requests.parse_value` behavior if isinstance(parsed, set): - # The English label for gender should be enough - gender = parsed & {keys.MALE, keys.FEMALE} - if gender: - wikidata[qid][keys.BIODATA].add((pid, gender.pop())) - else: - # Add a (pid, label) tuple for each element - # for better recall - for element in parsed: - wikidata[qid][keys.BIODATA].add((pid, element)) - # `parsed` is a tuple (timestamp, precision) id the value is a date + # Keep track of the value QID + # Dict key checks are already done in `api_requests.parse_value`, + # so no need to redo it here + v_qid = value['id'] + # Normalize & de-duplicate labels + # `text_utils.normalize` returns a tuple with two forms + # (non-lower, lower): take the lowercased one + labels = {text_utils.normalize(label)[1] for label in parsed} + # e.g., (P19, Q641, {'venezia', 'venice', ...}) + wikidata[qid][keys.BIODATA].append( + (pid, v_qid, labels) + ) + # If `parsed` is a tuple, we have a (timestamp, precision) date elif isinstance(parsed, tuple): timestamp, precision = parsed[0], parsed[1] # Get rid of time, useless timestamp = timestamp.split('T')[0] - wikidata[qid][keys.BIODATA].add((pid, f'{timestamp}/{precision}')) + wikidata[qid][keys.BIODATA].append((pid, f'{timestamp}/{precision}')) else: - wikidata[qid][keys.BIODATA].add((pid, parsed)) + wikidata[qid][keys.BIODATA].append((pid, parsed)) total += 1 LOGGER.info('Got %d statements', total) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 46bdeb73..279ce135 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -678,6 +678,8 @@ def _compute_shared_and_extra(criterion, wd_data, 
target_data): wd_dates, wd_other = _extract_dates(wd_data) target_dates, target_other = _extract_dates(target_data) shared_dates, extra_dates = _compare_dates(wd_dates, target_dates) + import ipdb; ipdb.set_trace() + # FIXME data model has changed: _compare_others shared = wd_other.intersection(target_other).union(shared_dates) extra = target_other.difference(wd_other).union(extra_dates) else: @@ -686,6 +688,14 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): return shared, extra +def _compare_others(wd, target): + shared, extra = set(), set() + wd_matches, target_matches = [], [] + + for i, wd_elem in enumerate(wd): + for j, t_elem in enumerate(target): + + def _extract_dates(data): dates = set() @@ -697,6 +707,7 @@ def _extract_dates(data): def _compare_dates(wd, target): +def _compare(what, wd, target): # Ensure unique comparisons, regardless of different precisions. # For instance: # `wd` has '1986-01-01/9' and '1986-11-29/11' @@ -721,11 +732,13 @@ def _compare_dates(wd, target): # Skip unexpected `None` values if None in (wd_val, t_val): LOGGER.warning( - 'Skipping unexpected %s date pair with missing value(s)', + 'Skipping unexpected %s pair with missing value(s)', (wd_elem, t_elem), ) continue + if what == 'dates': + # FIXME extract function wd_timestamp, wd_precision = wd_val.split('/') t_timestamp, t_precision = t_val.split('/') From 00b52b7e782c2c835511bdc33e146f028af8ebbc Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Tue, 10 Aug 2021 21:12:57 +0200 Subject: [PATCH 14/22] [WIP] comparison logic for values other than dates; refactor comparison --- soweego/validator/checks.py | 125 +++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 279ce135..adf3023b 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -19,7 +19,7 @@ import click from sqlalchemy.exc import SQLAlchemyError -from soweego.commons import constants, data_gathering, keys, target_database +from soweego.commons import constants, data_gathering, keys, target_database, text_utils from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot from soweego.wikidata import vocabulary @@ -673,29 +673,26 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): def _compute_shared_and_extra(criterion, wd_data, target_data): - # Properly compare dates when checking biographical data - if criterion == keys.BIODATA: + if criterion == keys.LINKS: + shared = wd_data.intersection(target_data) + extra = target_data.difference(wd_data) + # Biographical validation requires more complex comparisons + elif criterion == keys.BIODATA: wd_dates, wd_other = _extract_dates(wd_data) target_dates, target_other = _extract_dates(target_data) - shared_dates, extra_dates = _compare_dates(wd_dates, target_dates) + shared_dates, extra_dates = _compare('dates', wd_dates, target_dates) import ipdb; ipdb.set_trace() - # FIXME data model has changed: _compare_others - shared = wd_other.intersection(target_other).union(shared_dates) - extra = target_other.difference(wd_other).union(extra_dates) + shared_other, extra_other = _compare('other', wd_other, target_other) + shared = shared_dates | shared_other + extra = extra_dates | extra_other else: - shared = wd_data.intersection(target_data) - extra = target_data.difference(wd_data) + raise ValueError( + f"Invalid validation criterion: '{criterion}'. 
" + f"Please use either '{keys.LINKS}' or '{keys.BIODATA}'" + ) return shared, extra -def _compare_others(wd, target): - shared, extra = set(), set() - wd_matches, target_matches = [], [] - - for i, wd_elem in enumerate(wd): - for j, t_elem in enumerate(target): - - def _extract_dates(data): dates = set() @@ -706,14 +703,12 @@ def _extract_dates(data): return dates, data.difference(dates) -def _compare_dates(wd, target): def _compare(what, wd, target): - # Ensure unique comparisons, regardless of different precisions. - # For instance: - # `wd` has '1986-01-01/9' and '1986-11-29/11' - # `target` has '1986-01-01/9' - # `shared_dates` will have one element - shared_dates, extra_dates = set(), set() + shared, extra = set(), set() + # Keep track of matches to avoid useless computation + # and incorrect comparisons: + # this happens when WD has multiple claims with + # the same property wd_matches, target_matches = [], [] for i, wd_elem in enumerate(wd): @@ -722,45 +717,68 @@ def _compare(what, wd, target): if i in wd_matches or j in target_matches: continue - wd_pid, wd_val = wd_elem - t_pid, t_val = t_elem - - # Don't compare birth with death dates - if wd_pid != t_pid: + # Don't compare different PIDs + if wd_elem[0] != t_elem[0]: continue # Skip unexpected `None` values - if None in (wd_val, t_val): + if None in (wd_elem[1], t_elem[1]): LOGGER.warning( 'Skipping unexpected %s pair with missing value(s)', (wd_elem, t_elem), ) continue - if what == 'dates': - # FIXME extract function - wd_timestamp, wd_precision = wd_val.split('/') - t_timestamp, t_precision = t_val.split('/') - - shared_date, extra_date = _match_dates_by_precision( - min(int(wd_precision), int(t_precision)), - wd_elem, - wd_timestamp, - t_elem, - t_timestamp, + inputs = ( + shared, extra, wd_matches, target_matches, + i, wd_elem, j, t_elem ) + if what == 'dates': + _compare_dates(inputs) + elif what == 'other': + _compare_other(inputs) + else: + raise ValueError( + f"Invalid argument: '{what}'. 
" + "Please use either 'dates' or 'other'" + ) + + return shared, extra + - if shared_date is not None: - shared_dates.add(shared_date) - # Keep track of matches to avoid useless computation - # and incorrect comparisons: - # this happens when WD has multiple claims with - # the same property - wd_matches.append(i) - target_matches.append(j) - elif extra_date is not None: - extra_dates.add(extra_date) - return shared_dates, extra_dates +def _compare_other(inputs): + shared, extra, wd_matches, target_matches, i, wd_elem, j, t_elem = inputs + pid, qid, wd_values = wd_elem + _, t_value = t_elem + + # TODO improve matching + if text_utils.normalize(t_value) in wd_values: + shared.add((pid, qid)) + wd_matches.append(i) + target_matches.append(j) + else: + # TODO resolve target string into QID + extra.add((pid, t_value)) + + +def _compare_dates(inputs): + shared, extra, wd_matches, target_matches, i, wd_elem, j, t_elem = inputs + + wd_timestamp, wd_precision = wd_elem[1].split('/') + t_timestamp, t_precision = t_elem[1].split('/') + shared_date, extra_date = _match_dates_by_precision( + min(int(wd_precision), int(t_precision)), + wd_elem, + wd_timestamp, + t_elem, + t_timestamp, + ) + if shared_date is not None: + shared.add(shared_date) + wd_matches.append(i) + target_matches.append(j) + elif extra_date is not None: + extra.add(extra_date) def _match_dates_by_precision( @@ -820,6 +838,7 @@ def _dump_csv_output(data, outpath, log_msg_subject): LOGGER.info("No %s to be added, won't dump to file", log_msg_subject) +# FIXME adapt to new data model def _load_wd_cache(file_handle): raw_cache = json.load(file_handle) cache = {} @@ -853,7 +872,7 @@ def _dump_wd_cache(cache, outpath): json.dump( { qid: { - data_type: list(values) + data_type: values for data_type, values in data.items() } for qid, data in cache.items() From 93bc7f4c20a974df78ebb78a70b9e3464a78d734 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Thu, 12 Aug 2021 10:41:41 +0000 Subject: [PATCH 15/22] feeling-lucky resolution of a QID given a term, closes #414 --- soweego/wikidata/api_requests.py | 47 ++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index 61cb7a99..9cdff417 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -25,9 +25,9 @@ __author__ = 'Marco Fossati' __email__ = 'fossati@spaziodati.eu' -__version__ = '1.0' +__version__ = '2.0' __license__ = 'GPL-3.0' -__copyright__ = 'Copyleft 2018, Hjfocs' +__copyright__ = 'Copyleft 2021, Hjfocs' LOGGER = logging.getLogger(__name__) @@ -39,8 +39,45 @@ BUCKET_SIZE = 500 +def resolve_qid(term: str, language='en') -> str: + """Try to resolve a QID given a search term, in a *feeling lucky* way. + + :param term: a search term + :param language: (optional) search in the given language code. + Default: ``en``. 
+ :return: the QID of the first result + """ + params = { + 'action': 'wbsearchentities', + 'format': 'json', + 'search': term, + 'language': language + } + response_body = _make_request(params) + + # Failed API request + if response_body is None: + return None + + try: + return response_body['search'][0]['id'] + # Malformed JSON response + except KeyError as e: + LOGGER.error( + "Missing '%s' key from JSON response: %s", e, response_body + ) + return None + # No search results + except IndexError: + LOGGER.info( + "No QIDs found for search term '%s' (language: %s)", + term, language + ) + return None + + def get_url_blacklist() -> set: - """Retrieve a blacklist with URL domains of low-quality sources + """Retrieve a blacklist with URL domains of low-quality sources. :return: the set of blacklisted domains """ @@ -56,12 +93,12 @@ def get_url_blacklist() -> set: if response_body is None: return None - # Handle malformed JSON response + # Malformed JSON response try: star = response_body['parse']['text']['*'] # Interesting nonsense key except KeyError as e: LOGGER.error( - "Missing key %s from JSON response: %s", e, response_body, + "Missing '%s' key from JSON response: %s", e, response_body ) return None From 5072c78f514926543ba23644c19c9457f9c0064e Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Thu, 12 Aug 2021 10:54:06 +0000 Subject: [PATCH 16/22] closes #413 ; closes #417 ; see #414 Fix bio values comparison; dump shared statements; feeling-lucky QID resolution; pickle Wikidata cache, instead of JSON dump; simplify bio comparison; revisit & rename variables. --- soweego/validator/checks.py | 286 ++++++++++++++++++------------------ 1 file changed, 146 insertions(+), 140 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index adf3023b..88011978 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -13,6 +13,7 @@ import json import logging import os +import pickle from collections import defaultdict from typing import DefaultDict, Dict, Iterator, Tuple, Union @@ -27,18 +28,18 @@ LOGGER = logging.getLogger(__name__) -# For dead_ids_cli -DEAD_IDS_FILENAME = '{}_{}_dead_identifiers.json' -WD_IDS_FILENAME = '{}_{}_identifiers_in_wikidata.json' -# For links_cli -LINKS_IDS_TO_BE_DEPRECATED_FILENAME = '{}_{}_identifiers_to_be_deprecated.json' -EXTRA_IDS_TO_BE_ADDED_FILENAME = '{}_{}_third_party_identifiers_to_be_added.csv' -URLS_TO_BE_ADDED_FILENAME = '{}_{}_urls_to_be_added.csv' -WD_LINKS_FILENAME = '{}_{}_urls_in_wikidata.json' -# For bio_cli -BIO_IDS_TO_BE_DEPRECATED_FILENAME = '{}_{}_identifiers_to_be_deprecated.json' -BIO_STATEMENTS_TO_BE_ADDED_FILENAME = '{}_{}_bio_statements_to_be_added.csv' -WD_BIO_FILENAME = '{}_{}_bio_data_in_wikidata.json' +# File name templates +# For all CLIs +WD_CACHE_FNAME = '{catalog}_{entity}_{criterion}_wd_cache.pkl' +IDS_TO_BE_DEPRECATED_FNAME = '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' +SHARED_STATEMENTS_FNAME = '{catalog}_{entity}_{criterion}_shared_statements.csv' +# For `dead_ids_cli` +DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' +# For `links_cli` +EXT_IDS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_external_ids_to_be_added.csv' +URLS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_urls_to_be_added.csv' +# For `bio_cli` +BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' @click.command() @@ -63,7 +64,7 @@ @click.option( '--dump-wikidata', is_flag=True, - help='Dump identifiers gathered from Wikidata to a JSON file.', + help='Dump identifiers 
gathered from Wikidata to a Python pickle.', ) @click.option( '--dir-io', @@ -80,24 +81,30 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): you can pass the '-d' flag to do so. """ dead_ids_path = os.path.join( - dir_io, DEAD_IDS_FILENAME.format(catalog, entity) + dir_io, DEAD_IDS_FNAME.format(catalog=catalog, entity=entity) + ) + wd_cache_path = os.path.join( + dir_io, WD_CACHE_FNAME.format( + catalog=catalog, entity=entity, criterion='dead_ids' + ) ) - wd_ids_path = os.path.join(dir_io, WD_IDS_FILENAME.format(catalog, entity)) # Handle Wikidata cache - if os.path.isfile(wd_ids_path): - with open(wd_ids_path) as wdin: - wd_ids = _load_wd_cache(wdin) + if os.path.isfile(wd_cache_path): + with open(wd_cache_path, 'rb') as cin: + wd_cache = pickle.load(cin) + LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the second return value: Wikidata cache - dead, _ = dead_ids(catalog, entity, wd_cache=wd_ids) + dead, _ = dead_ids(catalog, entity, wd_cache=wd_cache) else: - dead, wd_ids = dead_ids(catalog, entity) + dead, wd_cache = dead_ids(catalog, entity) - # Dump ids gathered from Wikidata + # Dump Wikidata cache if dump_wikidata: - _dump_wd_cache(wd_ids, wd_ids_path) + with open(wd_cache_path, 'wb') as cout: + pickle.dump(wd_cache, cout) LOGGER.info( - 'Identifiers gathered from Wikidata dumped to %s', wd_ids_path + 'Identifiers gathered from Wikidata dumped to %s', wd_cache_path ) # Dump dead ids @@ -144,7 +151,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): @click.option( '--dump-wikidata', is_flag=True, - help='Dump URLs gathered from Wikidata to a JSON file.', + help='Dump URLs gathered from Wikidata to a Python pickle.', ) @click.option( '--dir-io', @@ -152,6 +159,7 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): default=constants.SHARED_FOLDER, help=f'Input/output directory, default: {constants.SHARED_FOLDER}.', ) +# TODO adapt to also dump shared links def links_cli( catalog, entity, blacklist, upload, sandbox, dump_wikidata, dir_io ): @@ -172,34 +180,44 @@ def links_cli( The '-b' flag applies a URL blacklist of low-quality Web domains. 
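    For instance, a full links validation run over one catalog could
    look something like this (CATALOG and ENTITY are placeholders for a
    supported catalog/entity pair; check the command's --help for the
    exact argument order):

        python -m soweego sync links CATALOG ENTITY -b --dump-wikidata
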
""" + criterion = 'links' # Output paths - deprecated_path = os.path.join( - dir_io, LINKS_IDS_TO_BE_DEPRECATED_FILENAME.format(catalog, entity) + deprecate_path = os.path.join( + dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) ids_path = os.path.join( - dir_io, EXTRA_IDS_TO_BE_ADDED_FILENAME.format(catalog, entity) + dir_io, EXT_IDS_TO_BE_ADDED_FNAME.format( + catalog=catalog, entity=entity + ) ) urls_path = os.path.join( - dir_io, URLS_TO_BE_ADDED_FILENAME.format(catalog, entity) + dir_io, URLS_TO_BE_ADDED_FNAME.format( + catalog=catalog, entity=entity + ) ) - wd_links_path = os.path.join( - dir_io, WD_LINKS_FILENAME.format(catalog, entity) + wd_cache_path = os.path.join( + dir_io, WD_CACHE_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) # Handle Wikidata cache - if os.path.isfile(wd_links_path): - with open(wd_links_path) as wdin: - wd_links = _load_wd_cache(wdin) + if os.path.isfile(wd_cache_path): + with open(wd_cache_path, 'rb') as cin: + wd_cache = pickle.load(cin) + LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, _ = links( - catalog, entity, blacklist, wd_cache=wd_links + catalog, entity, blacklist, wd_cache=wd_cache ) else: ( ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, - wd_links, + wd_cache, ) = links(catalog, entity, blacklist) # Nothing to do: the catalog doesn't contain links @@ -208,11 +226,12 @@ def links_cli( # Dump Wikidata cache if dump_wikidata: - _dump_wd_cache(wd_links, wd_links_path) - LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_links_path) + with open(wd_cache_path, 'wb') as cout: + pickle.dump(wd_cache, cout) + LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_cache_path) # Dump output files - _dump_deprecated(ids_to_be_deprecated, deprecated_path) + _dump_deprecated(ids_to_be_deprecated, deprecate_path) _dump_csv_output(ext_ids_to_be_added, ids_path, 'third-party IDs') _dump_csv_output(urls_to_be_added, urls_path, 'URLs') @@ -223,7 +242,6 @@ def links_cli( 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) - criterion = 'links' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( 'deprecate', catalog, entity, ids_to_be_deprecated, sandbox @@ -257,7 +275,7 @@ def links_cli( @click.option( '--dump-wikidata', is_flag=True, - help='Dump biographical data gathered from Wikidata to a JSON file.', + help='Dump biographical data gathered from Wikidata to a Python pickle.', ) @click.option( '--dir-io', @@ -270,47 +288,68 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): Look for birth/death dates, birth/death places, gender. - Dump 2 output files: + Dump 3 output files: 1. catalog IDs to be deprecated. JSON format: {catalog_ID: [list of QIDs]} - 2. statements to be added. CSV format: + 2. shared statements. CSV format: + QID,PID,value,catalog_ID + + 3. statements to be added. CSV format: QID,PID,value,catalog_ID You can pass the '-u' flag to upload the output to Wikidata. 
""" - deprecated_path = os.path.join( - dir_io, BIO_IDS_TO_BE_DEPRECATED_FILENAME.format(catalog, entity) + criterion = 'bio' + deprecate_path = os.path.join( + dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) - statements_path = os.path.join( - dir_io, BIO_STATEMENTS_TO_BE_ADDED_FILENAME.format(catalog, entity) + shared_path = os.path.join( + dir_io, SHARED_STATEMENTS_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) + ) + extra_path = os.path.join( + dir_io, BIO_STATEMENTS_TO_BE_ADDED_FNAME.format( + catalog=catalog, entity=entity + ) + ) + wd_cache_path = os.path.join( + dir_io, WD_CACHE_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) ) - wd_bio_path = os.path.join(dir_io, WD_BIO_FILENAME.format(catalog, entity)) # Handle Wikidata cache - if os.path.isfile(wd_bio_path): - with open(wd_bio_path) as wdin: - wd_bio = _load_wd_cache(wdin) + if os.path.isfile(wd_cache_path): + with open(wd_cache_path, 'rb') as cin: + wd_cache = pickle.load(cin) + LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - to_be_deprecated, to_be_added, _ = bio(catalog, entity, wd_cache=wd_bio) + deprecate, shared, extra, _ = bio(catalog, entity, wd_cache=wd_cache) else: - to_be_deprecated, to_be_added, wd_bio = bio(catalog, entity) + deprecate, shared, extra, wd_cache = bio(catalog, entity) # Nothing to do: the catalog doesn't contain biographical data - if to_be_deprecated is None: + if deprecate is None: return # Dump Wikidata cache if dump_wikidata: - _dump_wd_cache(wd_bio, wd_bio_path) + with open(wd_cache_path, 'wb') as cout: + pickle.dump(wd_cache, cout) LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', wd_bio_path + 'Biographical data gathered from Wikidata dumped to %s', + wd_cache_path ) # Dump output files - _dump_deprecated(to_be_deprecated, deprecated_path) - _dump_csv_output(to_be_added, statements_path, 'statements') + _dump_deprecated(deprecate, deprecate_path) + _dump_csv_output(shared, shared_path, 'shared statements') + _dump_csv_output(extra, extra_path, 'extra statements') # Upload the output to Wikidata if upload: @@ -319,12 +358,15 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) - criterion = 'bio' LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( - 'deprecate', catalog, entity, to_be_deprecated, sandbox + 'deprecate', catalog, entity, deprecate, sandbox ) - LOGGER.info('Starting addition of statements to Wikidata ...') + LOGGER.info('Starting referencing of shared statements in Wikidata ...') + wikidata_bot.add_people_statements( + catalog, to_be_added, criterion, sandbox + ) + LOGGER.info('Starting addition of extra statements to Wikidata ...') wikidata_bot.add_people_statements( catalog, to_be_added, criterion, sandbox ) @@ -454,7 +496,7 @@ def links( if target_links is None: return None, None, None, None - to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set) + deprecate, add = defaultdict(set), defaultdict(set) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -473,13 +515,13 @@ def links( wd_links = wd_cache # Validation - _validate(keys.LINKS, wd_links, target_links, to_be_deprecated, to_be_added) + _validate(keys.LINKS, wd_links, target_links, deprecate, add) # Separate external IDs from URLs ( ext_ids_to_be_added, 
urls_to_be_added, - ) = data_gathering.extract_ids_from_urls(to_be_added, ext_id_pids_to_urls) + ) = data_gathering.extract_ids_from_urls(add, ext_id_pids_to_urls) # Apply URL blacklist if url_blacklist: @@ -492,17 +534,17 @@ def links( 'URL statements to be added: %d', catalog, entity, - len(to_be_deprecated), + len(deprecate), len(ext_ids_to_be_added), len(urls_to_be_added), ) - return to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, wd_links + return deprecate, ext_ids_to_be_added, urls_to_be_added, wd_links def bio( catalog: str, entity: str, wd_cache=None -) -> Union[Tuple[defaultdict, Iterator, dict], Tuple[None, None, None]]: +) -> Union[Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None]]: """Validate identifiers against available biographical data. Look for: @@ -533,20 +575,21 @@ def bio( A supported entity :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata in a previous run - :return: 3 objects + :return: a ``tuple`` of 4 objects 1. ``dict`` of identifiers that should be deprecated - 2. ``generator`` of statements that should be added - 3. ``dict`` of biographical data gathered from Wikidata + 2. ``generator`` of shared statements that should be referenced + 3. ``generator`` of statements that should be added + 4. ``dict`` of biographical data gathered from Wikidata """ # Target catalog side first: # enable early return in case of no target data target_bio = data_gathering.gather_target_biodata(entity, catalog) if target_bio is None: - return None, None, None + return None, None, None, None - to_be_deprecated, to_be_added = defaultdict(set), defaultdict(set) + deprecate, reference, add = defaultdict(set), defaultdict(set), defaultdict(set) # Wikidata side if wd_cache is None: @@ -562,9 +605,13 @@ def bio( wd_bio = wd_cache # Validation - _validate(keys.BIODATA, wd_bio, target_bio, to_be_deprecated, to_be_added) + _validate( + keys.BIODATA, + wd_bio, target_bio, + deprecate, reference, add + ) - return to_be_deprecated, _bio_to_be_added_generator(to_be_added), wd_bio + return deprecate, _bio_statements_generator(reference), _bio_statements_generator(add), wd_bio def _apply_url_blacklist(url_statements): @@ -588,13 +635,13 @@ def _apply_url_blacklist(url_statements): return url_statements -def _bio_to_be_added_generator(to_be_added): - for (qid, tid,), values in to_be_added.items(): +def _bio_statements_generator(stmts_dict): + for (qid, tid), values in stmts_dict.items(): for pid, value in values: yield qid, pid, value, tid -def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): +def _validate(criterion, wd, target_generator, deprecate, reference, add): LOGGER.info('Starting check against target %s ...', criterion) target = _consume_target_generator(target_generator) @@ -641,7 +688,7 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): qid, tid, ) - to_be_deprecated[tid].add(qid) + deprecate[tid].add(qid) else: LOGGER.debug( '%s and %s share these %s: %s', @@ -650,6 +697,7 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): criterion, shared_data, ) + reference[(qid, tid)].update(shared_data) if extra_data: LOGGER.debug( @@ -659,16 +707,18 @@ def _validate(criterion, wd, target_generator, to_be_deprecated, to_be_added): qid, extra_data, ) - to_be_added[(qid, tid,)].update(extra_data) + add[(qid, tid)].update(extra_data) else: LOGGER.debug('%s has no extra %s', tid, criterion) LOGGER.info( 'Check against target %s completed: %d IDs to be deprecated, 
' + '%d Wikidata items with shared statements to be referenced, ', '%d Wikidata items with statements to be added', criterion, - len(to_be_deprecated), - len(to_be_added), + len(deprecate), + len(reference), + len(add), ) @@ -678,10 +728,17 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): extra = target_data.difference(wd_data) # Biographical validation requires more complex comparisons elif criterion == keys.BIODATA: - wd_dates, wd_other = _extract_dates(wd_data) - target_dates, target_other = _extract_dates(target_data) + # `wd_data` has either couples or triples: couples are dates + wd_dates = set(filter(lambda x: len(x) == 2, wd_data)) + # No cast to `set` because `wd_data` triples hold sets themselves + wd_other = list(filter(lambda x: len(x) == 3, wd_data)) + # In `target_data` we look for relevant date PIDs + target_dates = set(filter( + lambda x: x[0] in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), + target_data + )) + target_other = target_data.difference(target_dates) shared_dates, extra_dates = _compare('dates', wd_dates, target_dates) - import ipdb; ipdb.set_trace() shared_other, extra_other = _compare('other', wd_other, target_other) shared = shared_dates | shared_other extra = extra_dates | extra_other @@ -694,15 +751,6 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): return shared, extra -def _extract_dates(data): - dates = set() - for pid, value in data: - if pid in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH): - dates.add((pid, value)) - # Separate dates from other data - return dates, data.difference(dates) - - def _compare(what, wd, target): shared, extra = set(), set() # Keep track of matches to avoid useless computation @@ -751,14 +799,17 @@ def _compare_other(inputs): pid, qid, wd_values = wd_elem _, t_value = t_elem + # Take the lowercased normalized value # TODO improve matching - if text_utils.normalize(t_value) in wd_values: + _, t_normalized = text_utils.normalize(t_value) + if t_normalized in wd_values: shared.add((pid, qid)) wd_matches.append(i) target_matches.append(j) else: - # TODO resolve target string into QID - extra.add((pid, t_value)) + t_qid = api_requests.resolve_qid(t_normalized) + if t_qid is not None: + extra.add((pid, t_qid)) def _compare_dates(inputs): @@ -838,51 +889,6 @@ def _dump_csv_output(data, outpath, log_msg_subject): LOGGER.info("No %s to be added, won't dump to file", log_msg_subject) -# FIXME adapt to new data model -def _load_wd_cache(file_handle): - raw_cache = json.load(file_handle) - cache = {} - for qid, data in raw_cache.items(): - for data_type, value_list in data.items(): - # Biodata has values that are a list - if value_list and isinstance(value_list[0], list): - value_set = set() - for value in value_list: - if isinstance(value[1], list): - same_pid, different_values = value[0], value[1] - for val in different_values: - value_set.add((same_pid, val)) - else: - value_set.add(tuple(value)) - if cache.get(qid): - cache[qid][data_type] = value_set - else: - cache[qid] = {data_type: value_set} - else: - if cache.get(qid): - cache[qid][data_type] = set(value_list) - else: - cache[qid] = {data_type: set(value_list)} - LOGGER.info("Loaded Wikidata cache from '%s'", file_handle.name) - return cache - - -def _dump_wd_cache(cache, outpath): - with open(outpath, 'w') as outfile: - json.dump( - { - qid: { - data_type: values - for data_type, values in data.items() - } - for qid, data in cache.items() - }, - outfile, - indent=2, - ensure_ascii=False, - ) - - def 
_consume_target_generator(target_generator): target = defaultdict(set) for identifier, *data in target_generator: From cbc36b9d40fb2f2efdd5300c83b4e33b01e87d03 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Thu, 12 Aug 2021 16:03:43 +0200 Subject: [PATCH 17/22] use P2888 for bare URLs statements, closes #409 --- soweego/commons/data_gathering.py | 2 +- soweego/wikidata/vocabulary.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/soweego/commons/data_gathering.py b/soweego/commons/data_gathering.py index 86a150ee..15085635 100644 --- a/soweego/commons/data_gathering.py +++ b/soweego/commons/data_gathering.py @@ -527,7 +527,7 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls): ext_ids_to_add.append((qid, pid, ext_id, tid,)) else: urls_to_add.append( - (qid, vocabulary.DESCRIBED_AT_URL, url, tid,) + (qid, vocabulary.EXACT_MATCH, url, tid,) ) return ( ext_ids_to_add, diff --git a/soweego/wikidata/vocabulary.py b/soweego/wikidata/vocabulary.py index 7219e5fc..421204af 100644 --- a/soweego/wikidata/vocabulary.py +++ b/soweego/wikidata/vocabulary.py @@ -53,6 +53,9 @@ # Widely used generic property to hold URLs DESCRIBED_AT_URL = 'P973' OFFICIAL_WEBSITE = 'P856' +# See BrokenSegue's comment at +# https://www.wikidata.org/wiki/Wikidata:Project_chat/Archive/2021/08#URLs_statistics_for_Discogs_(Q504063)_and_MusicBrainz_(Q14005) +EXACT_MATCH = 'P2888' # Class QID of supported entities # People From 8205c80a8d23bd0e282e15bf89d1a20c2aa647c7 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Thu, 12 Aug 2021 16:22:38 +0200 Subject: [PATCH 18/22] validation order: deprecate, add, reference; links validation: dump shared statements; revisit & rename variables --- soweego/validator/checks.py | 177 +++++++++++++++++++++--------------- 1 file changed, 102 insertions(+), 75 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 88011978..ce0349ea 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -23,7 +23,7 @@ from soweego.commons import constants, data_gathering, keys, target_database, text_utils from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot -from soweego.wikidata import vocabulary +from soweego.wikidata import vocabulary, api_requests from soweego.wikidata.api_requests import get_url_blacklist LOGGER = logging.getLogger(__name__) @@ -36,8 +36,8 @@ # For `dead_ids_cli` DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' # For `links_cli` -EXT_IDS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_external_ids_to_be_added.csv' -URLS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_urls_to_be_added.csv' +EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv' +URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv' # For `bio_cli` BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' @@ -159,13 +159,12 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): default=constants.SHARED_FOLDER, help=f'Input/output directory, default: {constants.SHARED_FOLDER}.', ) -# TODO adapt to also dump shared links def links_cli( catalog, entity, blacklist, upload, sandbox, dump_wikidata, dir_io ): """Validate identifiers against links. - Dump 3 output files: + Dump 5 output files: 1. catalog IDs to be deprecated. JSON format: {catalog_ID: [list of QIDs]} @@ -174,11 +173,15 @@ def links_cli( QID,third-party_PID,third-party_ID,catalog_ID 3. URLs to be added. CSV format: - QID,P973,URL,catalog_ID + QID,P2888,URL,catalog_ID + + 4. 
third-party IDs to be referenced. Same format as file #2 + + 5. URLs to be referenced. Same format as file #3 You can pass the '-u' flag to upload the output to Wikidata. - The '-b' flag applies a URL blacklist of low-quality Web domains. + The '-b' flag applies a URL blacklist of low-quality Web domains to file #3. """ criterion = 'links' # Output paths @@ -187,14 +190,24 @@ def links_cli( catalog=catalog, entity=entity, criterion=criterion ) ) - ids_path = os.path.join( - dir_io, EXT_IDS_TO_BE_ADDED_FNAME.format( - catalog=catalog, entity=entity + add_ext_ids_path = os.path.join( + dir_io, EXT_IDS_FNAME.format( + catalog=catalog, entity=entity, task='added' ) ) - urls_path = os.path.join( - dir_io, URLS_TO_BE_ADDED_FNAME.format( - catalog=catalog, entity=entity + add_urls_path = os.path.join( + dir_io, URLS_FNAME.format( + catalog=catalog, entity=entity, task='added' + ) + ) + ref_ext_ids_path = os.path.join( + dir_io, EXT_IDS_FNAME.format( + catalog=catalog, entity=entity, task='referenced' + ) + ) + ref_urls_path = os.path.join( + dir_io, URLS_FNAME.format( + catalog=catalog, entity=entity, task='referenced' ) ) wd_cache_path = os.path.join( @@ -209,19 +222,16 @@ def links_cli( wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - ids_to_be_deprecated, ext_ids_to_be_added, urls_to_be_added, _ = links( + deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, _ = links( catalog, entity, blacklist, wd_cache=wd_cache ) else: - ( - ids_to_be_deprecated, - ext_ids_to_be_added, - urls_to_be_added, - wd_cache, - ) = links(catalog, entity, blacklist) + deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links( + catalog, entity, blacklist + ) # Nothing to do: the catalog doesn't contain links - if ids_to_be_deprecated is None: + if deprecate is None: return # Dump Wikidata cache @@ -231,9 +241,11 @@ def links_cli( LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_cache_path) # Dump output files - _dump_deprecated(ids_to_be_deprecated, deprecate_path) - _dump_csv_output(ext_ids_to_be_added, ids_path, 'third-party IDs') - _dump_csv_output(urls_to_be_added, urls_path, 'URLs') + _dump_deprecated(deprecate, deprecate_path) + _dump_csv_output(add_ext_ids, add_ext_ids_path, 'third-party IDs to be added') + _dump_csv_output(add_urls, add_urls_path, 'URLs to be added') + _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced') + _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced') # Upload the output to Wikidata if upload: @@ -244,15 +256,23 @@ def links_cli( ) LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( - 'deprecate', catalog, entity, ids_to_be_deprecated, sandbox + 'deprecate', catalog, entity, deprecate, sandbox ) LOGGER.info('Starting addition of external IDs to Wikidata ...') wikidata_bot.add_people_statements( - catalog, ext_ids_to_be_added, criterion, sandbox + catalog, add_ext_ids, criterion, sandbox + ) + LOGGER.info('Starting addition of URLs to Wikidata ...') + wikidata_bot.add_people_statements( + catalog, add_urls, criterion, sandbox + ) + LOGGER.info('Starting referencing of shared external IDs in Wikidata ...') + wikidata_bot.add_people_statements( + catalog, add_ext_ids, criterion, sandbox ) - LOGGER.info('Starting addition of statements to Wikidata ...') + LOGGER.info('Starting referencing of shared URLs in Wikidata ...') wikidata_bot.add_people_statements( - 
catalog, urls_to_be_added, criterion, sandbox + catalog, add_urls, criterion, sandbox ) @@ -293,11 +313,10 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): 1. catalog IDs to be deprecated. JSON format: {catalog_ID: [list of QIDs]} - 2. shared statements. CSV format: + 2. statements to be added. CSV format: QID,PID,value,catalog_ID - 3. statements to be added. CSV format: - QID,PID,value,catalog_ID + 3. shared statements to be referenced. Same format as file #2 You can pass the '-u' flag to upload the output to Wikidata. """ @@ -307,16 +326,16 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): catalog=catalog, entity=entity, criterion=criterion ) ) - shared_path = os.path.join( - dir_io, SHARED_STATEMENTS_FNAME.format( - catalog=catalog, entity=entity, criterion=criterion - ) - ) - extra_path = os.path.join( + add_path = os.path.join( dir_io, BIO_STATEMENTS_TO_BE_ADDED_FNAME.format( catalog=catalog, entity=entity ) ) + ref_path = os.path.join( + dir_io, SHARED_STATEMENTS_FNAME.format( + catalog=catalog, entity=entity, criterion=criterion + ) + ) wd_cache_path = os.path.join( dir_io, WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion @@ -329,9 +348,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - deprecate, shared, extra, _ = bio(catalog, entity, wd_cache=wd_cache) + deprecate, add, reference, _ = bio(catalog, entity, wd_cache=wd_cache) else: - deprecate, shared, extra, wd_cache = bio(catalog, entity) + deprecate, add, reference, wd_cache = bio(catalog, entity) # Nothing to do: the catalog doesn't contain biographical data if deprecate is None: @@ -348,10 +367,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # Dump output files _dump_deprecated(deprecate, deprecate_path) - _dump_csv_output(shared, shared_path, 'shared statements') - _dump_csv_output(extra, extra_path, 'extra statements') + _dump_csv_output(add, add_path, 'statements to be added') + _dump_csv_output(reference, ref_path, 'shared statements to be referenced') - # Upload the output to Wikidata + # Upload the output to Wikidata: + # deprecate, add, reference if upload: if sandbox: LOGGER.info( @@ -362,13 +382,13 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): wikidata_bot.delete_or_deprecate_identifiers( 'deprecate', catalog, entity, deprecate, sandbox ) - LOGGER.info('Starting referencing of shared statements in Wikidata ...') + LOGGER.info('Starting addition of extra statements to Wikidata ...') wikidata_bot.add_people_statements( - catalog, to_be_added, criterion, sandbox + catalog, add, criterion, sandbox ) - LOGGER.info('Starting addition of extra statements to Wikidata ...') + LOGGER.info('Starting referencing of shared statements in Wikidata ...') wikidata_bot.add_people_statements( - catalog, to_be_added, criterion, sandbox + catalog, reference, criterion, sandbox ) @@ -453,7 +473,7 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Union[Tuple[defaultdict, list, list, dict], Tuple[None, None, None, None]]: +) -> Union[Tuple[defaultdict, list, list, list, list, dict], Tuple[None, None, None, None, None, None]]: """Validate identifiers against available links. Also generate statements based on additional links @@ -482,21 +502,23 @@ def links( of URL domains. 
Default: ``False`` :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata in a previous run. Default: ``None`` - :return: 4 objects + :return: ``tuple`` of 6 objects 1. ``dict`` of identifiers that should be deprecated 2. ``list`` of third-party identifiers that should be added 3. ``list`` of URLs that should be added - 4. ``dict`` of links gathered from Wikidata + 4. ``list`` of third-party identifiers that should be referenced + 5. ``list`` of URLs that should be referenced + 6. ``dict`` of links gathered from Wikidata """ # Target catalog side first: # enable early return in case of no target links target_links = data_gathering.gather_target_links(entity, catalog) if target_links is None: - return None, None, None, None + return None, None, None, None, None, None - deprecate, add = defaultdict(set), defaultdict(set) + deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -515,31 +537,36 @@ def links( wd_links = wd_cache # Validation - _validate(keys.LINKS, wd_links, target_links, deprecate, add) + _validate(keys.LINKS, wd_links, target_links, deprecate, add, reference) - # Separate external IDs from URLs - ( - ext_ids_to_be_added, - urls_to_be_added, - ) = data_gathering.extract_ids_from_urls(add, ext_id_pids_to_urls) - - # Apply URL blacklist + # Links to be added: + # 1. Separate external IDs from URLs + add_ext_ids, add_urls = data_gathering.extract_ids_from_urls( + add, ext_id_pids_to_urls + ) + # 2. Apply URL blacklist if url_blacklist: - urls_to_be_added = _apply_url_blacklist(urls_to_be_added) + add_urls = _apply_url_blacklist(add_urls) + + # Links to be referenced: separate external IDs from URLs + ref_ext_ids, ref_urls = data_gathering.extract_ids_from_urls( + reference, ext_id_pids_to_urls + ) LOGGER.info( 'Validation completed. Target: %s %s. ' 'IDs to be deprecated: %d. ' 'Third-party IDs to be added: %d. ' 'URL statements to be added: %d', - catalog, - entity, + 'Third-party IDs to be referenced: %d. ' + 'URL statements to be referenced: %d', + catalog, entity, len(deprecate), - len(ext_ids_to_be_added), - len(urls_to_be_added), + len(add_ext_ids), len(add_urls), + len(ref_ext_ids), len(ref_urls) ) - return deprecate, ext_ids_to_be_added, urls_to_be_added, wd_links + return deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_links def bio( @@ -578,8 +605,8 @@ def bio( :return: a ``tuple`` of 4 objects 1. ``dict`` of identifiers that should be deprecated - 2. ``generator`` of shared statements that should be referenced - 3. ``generator`` of statements that should be added + 2. ``generator`` of statements that should be added + 3. ``generator`` of shared statements that should be referenced 4. 
``dict`` of biographical data gathered from Wikidata """ @@ -589,7 +616,7 @@ def bio( if target_bio is None: return None, None, None, None - deprecate, reference, add = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) # Wikidata side if wd_cache is None: @@ -608,10 +635,10 @@ def bio( _validate( keys.BIODATA, wd_bio, target_bio, - deprecate, reference, add + deprecate, add, reference ) - return deprecate, _bio_statements_generator(reference), _bio_statements_generator(add), wd_bio + return deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), wd_bio def _apply_url_blacklist(url_statements): @@ -641,7 +668,7 @@ def _bio_statements_generator(stmts_dict): yield qid, pid, value, tid -def _validate(criterion, wd, target_generator, deprecate, reference, add): +def _validate(criterion, wd, target_generator, deprecate, add, reference): LOGGER.info('Starting check against target %s ...', criterion) target = _consume_target_generator(target_generator) @@ -713,12 +740,12 @@ def _validate(criterion, wd, target_generator, deprecate, reference, add): LOGGER.info( 'Check against target %s completed: %d IDs to be deprecated, ' - '%d Wikidata items with shared statements to be referenced, ', '%d Wikidata items with statements to be added', + '%d Wikidata items with shared statements to be referenced, ', criterion, len(deprecate), - len(reference), len(add), + len(reference), ) @@ -884,9 +911,9 @@ def _dump_csv_output(data, outpath, log_msg_subject): with open(outpath, 'w') as ids_out: writer = csv.writer(ids_out) writer.writerows(data) - LOGGER.info('%s to be added dumped to %s', log_msg_subject, outpath) + LOGGER.info('%s dumped to %s', log_msg_subject, outpath) else: - LOGGER.info("No %s to be added, won't dump to file", log_msg_subject) + LOGGER.info("No %s, won't dump to file", log_msg_subject) def _consume_target_generator(target_generator): From 20906aa7accc2179afe9f9512de05a7239ab2a61 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Fri, 13 Aug 2021 09:43:43 +0000 Subject: [PATCH 19/22] dump output first, then eventual cache; use highest pickle protocol; fix log formatting exception --- soweego/validator/checks.py | 67 +++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index ce0349ea..ff967438 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -99,14 +99,6 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): else: dead, wd_cache = dead_ids(catalog, entity) - # Dump Wikidata cache - if dump_wikidata: - with open(wd_cache_path, 'wb') as cout: - pickle.dump(wd_cache, cout) - LOGGER.info( - 'Identifiers gathered from Wikidata dumped to %s', wd_cache_path - ) - # Dump dead ids with open(dead_ids_path, 'w') as fout: # Sets are not serializable to JSON, so cast them to lists @@ -115,9 +107,21 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): fout, indent=2, ) - LOGGER.info('Dead identifiers dumped to %s', dead_ids_path) + # Dump Wikidata cache + if dump_wikidata: + try: + with open(wd_cache_path, 'wb') as cout: + # Using the highest protocol available for the current Python + # version should be the most efficient solution + pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) + LOGGER.info( + 'Identifiers gathered from Wikidata dumped to %s', wd_cache_path + ) + except MemoryError: + 
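+            # A very large Wikidata cache may not fit in memory while
+            # being pickled: in that case, just warn and skip the dump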
LOGGER.warning('Could not pickle the Wikidata cache: memory error') + # Deprecate dead ids in Wikidata if deprecate: LOGGER.info('Starting deprecation of %s IDs ...', catalog) @@ -234,12 +238,6 @@ def links_cli( if deprecate is None: return - # Dump Wikidata cache - if dump_wikidata: - with open(wd_cache_path, 'wb') as cout: - pickle.dump(wd_cache, cout) - LOGGER.info('URLs gathered from Wikidata dumped to %s', wd_cache_path) - # Dump output files _dump_deprecated(deprecate, deprecate_path) _dump_csv_output(add_ext_ids, add_ext_ids_path, 'third-party IDs to be added') @@ -247,6 +245,19 @@ def links_cli( _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced') _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced') + # Dump Wikidata cache + if dump_wikidata: + try: + with open(wd_cache_path, 'wb') as cout: + # Using the highest protocol available for the current Python + # version should be the most efficient solution + pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) + LOGGER.info( + 'URLs gathered from Wikidata dumped to %s', wd_cache_path + ) + except MemoryError: + LOGGER.warning('Could not pickle the Wikidata cache: memory error') + # Upload the output to Wikidata if upload: if sandbox: @@ -356,20 +367,24 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): if deprecate is None: return - # Dump Wikidata cache - if dump_wikidata: - with open(wd_cache_path, 'wb') as cout: - pickle.dump(wd_cache, cout) - LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', - wd_cache_path - ) - # Dump output files _dump_deprecated(deprecate, deprecate_path) _dump_csv_output(add, add_path, 'statements to be added') _dump_csv_output(reference, ref_path, 'shared statements to be referenced') + # Dump Wikidata cache + if dump_wikidata: + try: + with open(wd_cache_path, 'wb') as cout: + # Using the highest protocol available for the current Python + # version should be the most efficient solution + pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) + LOGGER.info( + 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path + ) + except MemoryError: + LOGGER.warning('Could not pickle the Wikidata cache: memory error') + # Upload the output to Wikidata: # deprecate, add, reference if upload: @@ -740,8 +755,8 @@ def _validate(criterion, wd, target_generator, deprecate, add, reference): LOGGER.info( 'Check against target %s completed: %d IDs to be deprecated, ' - '%d Wikidata items with statements to be added', - '%d Wikidata items with shared statements to be referenced, ', + '%d Wikidata items with statements to be added, ', + '%d Wikidata items with shared statements to be referenced', criterion, len(deprecate), len(add), From 39345a6a82d9ca0f67ad7df94f6400942f45dbac Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Wed, 18 Aug 2021 13:04:43 +0200 Subject: [PATCH 20/22] better type annotation & docstring --- soweego/wikidata/api_requests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index 9cdff417..c5e6ad75 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -10,7 +10,7 @@ from collections import defaultdict from functools import lru_cache, partial from multiprocessing.pool import Pool -from typing import Dict, Iterator, List, Set, TextIO, Tuple, Union +from typing import Dict, Iterator, List, Optional, Set, TextIO, Tuple, Union from urllib.parse import 
urlunsplit import lxml.html @@ -39,13 +39,13 @@ BUCKET_SIZE = 500 -def resolve_qid(term: str, language='en') -> str: +def resolve_qid(term: str, language='en') -> Optional[str]: """Try to resolve a QID given a search term, in a *feeling lucky* way. :param term: a search term :param language: (optional) search in the given language code. Default: ``en``. - :return: the QID of the first result + :return: the QID of the first result, or ``None`` in case of no result """ params = { 'action': 'wbsearchentities', From fa16a4b45930d41da67f4e3fa7aa76c81c485589 Mon Sep 17 00:00:00 2001 From: Marco Fossati Date: Wed, 18 Aug 2021 13:09:13 +0200 Subject: [PATCH 21/22] better type annotation & docstring again --- soweego/wikidata/api_requests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index c5e6ad75..fb98dc8f 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -76,10 +76,11 @@ def resolve_qid(term: str, language='en') -> Optional[str]: return None -def get_url_blacklist() -> set: +def get_url_blacklist() -> Optional[set]: """Retrieve a blacklist with URL domains of low-quality sources. - :return: the set of blacklisted domains + :return: the set of blacklisted domains, + or ``None`` in case of issues with the Wikidata Web API """ params = { 'action': 'parse', From 661e20bc36635047e1b6e8754670d407850c8632 Mon Sep 17 00:00:00 2001 From: travis Date: Wed, 18 Aug 2021 11:30:45 +0000 Subject: [PATCH 22/22] format code & organize imports --- soweego/commons/data_gathering.py | 20 ++-- soweego/ingester/wikidata_bot.py | 154 +++++++++++++++++--------- soweego/linker/baseline.py | 4 +- soweego/validator/checks.py | 174 +++++++++++++++++++----------- soweego/wikidata/api_requests.py | 5 +- 5 files changed, 228 insertions(+), 129 deletions(-) diff --git a/soweego/commons/data_gathering.py b/soweego/commons/data_gathering.py index 15085635..30bdb752 100644 --- a/soweego/commons/data_gathering.py +++ b/soweego/commons/data_gathering.py @@ -20,7 +20,13 @@ from sqlalchemy import or_ from tqdm import tqdm -from soweego.commons import constants, keys, target_database, text_utils, url_utils +from soweego.commons import ( + constants, + keys, + target_database, + text_utils, + url_utils, +) from soweego.commons.db_manager import DBManager from soweego.importer import models from soweego.wikidata import api_requests, sparql_queries, vocabulary @@ -395,15 +401,15 @@ def gather_wikidata_biodata(wikidata): # (non-lower, lower): take the lowercased one labels = {text_utils.normalize(label)[1] for label in parsed} # e.g., (P19, Q641, {'venezia', 'venice', ...}) - wikidata[qid][keys.BIODATA].append( - (pid, v_qid, labels) - ) + wikidata[qid][keys.BIODATA].append((pid, v_qid, labels)) # If `parsed` is a tuple, we have a (timestamp, precision) date elif isinstance(parsed, tuple): timestamp, precision = parsed[0], parsed[1] # Get rid of time, useless timestamp = timestamp.split('T')[0] - wikidata[qid][keys.BIODATA].append((pid, f'{timestamp}/{precision}')) + wikidata[qid][keys.BIODATA].append( + (pid, f'{timestamp}/{precision}') + ) else: wikidata[qid][keys.BIODATA].append((pid, parsed)) total += 1 @@ -526,9 +532,7 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls): if ext_id is not None: ext_ids_to_add.append((qid, pid, ext_id, tid,)) else: - urls_to_add.append( - (qid, vocabulary.EXACT_MATCH, url, tid,) - ) + urls_to_add.append((qid, vocabulary.EXACT_MATCH, url, tid,)) return ( 
ext_ids_to_add, urls_to_add, diff --git a/soweego/ingester/wikidata_bot.py b/soweego/ingester/wikidata_bot.py index 30ad56f1..0065b2b7 100644 --- a/soweego/ingester/wikidata_bot.py +++ b/soweego/ingester/wikidata_bot.py @@ -112,8 +112,7 @@ def delete_cli(catalog, entity, invalid_identifiers, sandbox): """ if sandbox: LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) delete_or_deprecate_identifiers( @@ -141,8 +140,7 @@ def deprecate_cli(catalog, entity, invalid_identifiers, sandbox): """ if sandbox: LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + 'Running on the Wikidata sandbox item %s ...', vocabulary.SANDBOX_2 ) delete_or_deprecate_identifiers( @@ -193,7 +191,7 @@ def identifiers_cli(catalog, entity, identifiers, sandbox): '--criterion', type=click.Choice(('links', 'bio')), help='Validation criterion used to generate STATEMENTS. ' - 'Same as the command passed to `python -m soweego sync`' + 'Same as the command passed to `python -m soweego sync`', ) @click.option( '-s', @@ -235,9 +233,7 @@ def people_cli(catalog, statements, criterion, sandbox): edit_summary = None if sandbox: - LOGGER.info( - 'Running on the Wikidata sandbox item %s ...', sandbox_item - ) + LOGGER.info('Running on the Wikidata sandbox item %s ...', sandbox_item) stmt_reader = csv.reader(statements) for person, predicate, value, catalog_id in stmt_reader: @@ -248,7 +244,7 @@ def people_cli(catalog, statements, criterion, sandbox): catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) @@ -295,8 +291,11 @@ def works_cli(catalog, statements, sandbox): _add_or_reference_works( (subject, predicate, person), heuristic, - catalog_qid, person_pid, person_id, - is_imdb=is_imdb, edit_summary=WORKS_SUMMARY + catalog_qid, + person_pid, + person_id, + is_imdb=is_imdb, + edit_summary=WORKS_SUMMARY, ) @@ -327,7 +326,8 @@ def add_identifiers( _add_or_reference( (subject, catalog_pid, tid,), heuristic, - edit_summary=IDENTIFIERS_SUMMARY) + edit_summary=IDENTIFIERS_SUMMARY, + ) def add_people_statements( @@ -368,7 +368,10 @@ def add_people_statements( for subject, predicate, value, catalog_id in statements: LOGGER.info( 'Processing (%s, %s, %s, %s) statement ...', - subject, predicate, value, catalog_id + subject, + predicate, + value, + catalog_id, ) actual_subject = subject if not sandbox else sandbox_item _add_or_reference( @@ -377,7 +380,7 @@ def add_people_statements( catalog_qid=catalog_qid, catalog_pid=person_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) @@ -407,7 +410,10 @@ def add_works_statements( for work, predicate, person, person_id in statements: LOGGER.info( 'Processing (%s, %s, %s, %s) statement', - work, predicate, person, person_id + work, + predicate, + person, + person_id, ) subject = work if not sandbox else sandbox_item _add_or_reference_works( @@ -417,7 +423,7 @@ def add_works_statements( person_pid, person_id, is_imdb=is_imdb, - edit_summary=WORKS_SUMMARY + edit_summary=WORKS_SUMMARY, ) @@ -456,15 +462,23 @@ def delete_or_deprecate_identifiers( _delete_or_deprecate(action, actual_qid, tid, catalog, catalog_pid) -def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_pid: str, catalog_id: str, - is_imdb=False, edit_summary=None) -> None: +def _add_or_reference_works( + statement: tuple, + heuristic: str, + catalog_qid: str, 
+ catalog_pid: str, + catalog_id: str, + is_imdb=False, + edit_summary=None, +) -> None: work, predicate, person = statement # Parse value into an item in case of QID qid = match(QID_REGEX, person) if not qid: LOGGER.warning( "%s doesn't look like a QID, won't try to add the %s statement", - person, statement + person, + statement, ) return person_item = pywikibot.ItemPage(REPO, qid.group()) @@ -490,7 +504,7 @@ def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ): return @@ -508,15 +522,21 @@ def _add_or_reference_works(statement: tuple, heuristic: str, catalog_qid: str, def _add_or_reference( - statement, heuristic, - catalog_qid=None, catalog_pid=None, catalog_id=None, - edit_summary=None + statement, + heuristic, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ) -> None: subject, predicate, value = statement subject_item, claims = _essential_checks( - statement, heuristic, - catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + statement, + heuristic, + catalog_qid=catalog_qid, + catalog_pid=catalog_pid, + catalog_id=catalog_id, + edit_summary=edit_summary, ) if None in (subject_item, claims): @@ -533,7 +553,7 @@ def _add_or_reference( edit_summary=edit_summary, catalog_qid=catalog_qid, catalog_pid=catalog_pid, - catalog_id=catalog_id + catalog_id=catalog_id, ): return @@ -554,7 +574,7 @@ def _add_or_reference( catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) @@ -577,12 +597,14 @@ def _handle_addition( if not given_predicate_claims: LOGGER.debug('%s has no %s claim', subject_qid, predicate) _add( - subject_item, predicate, value, + subject_item, + predicate, + value, heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return @@ -605,12 +627,14 @@ def _handle_addition( '%s has no %s claim with value %s', subject_qid, predicate, value ) _add( - subject_item, predicate, value, + subject_item, + predicate, + value, heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return @@ -621,12 +645,26 @@ def _handle_addition( if case_insensitive: for claim in given_predicate_claims: if claim.getTarget().lower() == value: - _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) + _reference( + claim, + heuristic, + catalog_qid, + catalog_pid, + catalog_id, + edit_summary=edit_summary, + ) return for claim in given_predicate_claims: if claim.getTarget() == value: - _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) + _reference( + claim, + heuristic, + catalog_qid, + catalog_pid, + catalog_id, + edit_summary=edit_summary, + ) def _handle_redirect_and_dead(qid): @@ -662,11 +700,14 @@ def _essential_checks( if not data: LOGGER.warning('%s has no data at all', subject) _add( - item, predicate, value, heuristic, + item, + predicate, + value, + heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return None, None @@ -675,11 +716,14 @@ def _essential_checks( if not claims: LOGGER.warning('%s has no claims', subject) _add( - item, predicate, 
value, heuristic, + item, + predicate, + value, + heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return None, None @@ -707,11 +751,12 @@ def _check_for_same_value( value, ) _reference( - claim, heuristic, + claim, + heuristic, catalog_qid=catalog_qid, catalog_pid=catalog_pid, catalog_id=catalog_id, - edit_summary=edit_summary + edit_summary=edit_summary, ) return True return False @@ -762,15 +807,26 @@ def _add( claim.setTarget(value) subject_item.addClaim(claim, summary=edit_summary) LOGGER.debug('Added claim: %s', claim.toJSON()) - _reference(claim, heuristic, catalog_qid, catalog_pid, catalog_id, edit_summary=edit_summary) + _reference( + claim, + heuristic, + catalog_qid, + catalog_pid, + catalog_id, + edit_summary=edit_summary, + ) LOGGER.info( 'Added (%s, %s, %s) statement', subject_item.getID(), predicate, value ) def _reference( - claim: pywikibot.Claim, heuristic: str, - catalog_qid=None, catalog_pid=None, catalog_id=None, edit_summary=None + claim: pywikibot.Claim, + heuristic: str, + catalog_qid=None, + catalog_pid=None, + catalog_id=None, + edit_summary=None, ): reference_node, log_buffer = [], [] @@ -783,9 +839,7 @@ def _reference( based_on_heuristic_reference = pywikibot.Claim( REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True ) - based_on_heuristic_reference.setTarget( - pywikibot.ItemPage(REPO, heuristic) - ) + based_on_heuristic_reference.setTarget(pywikibot.ItemPage(REPO, heuristic)) reference_node.append(based_on_heuristic_reference) log_buffer.append(f'({based_on_heuristic_reference.getID()}, {heuristic})') @@ -801,7 +855,9 @@ def _reference( if catalog_pid is not None and catalog_id is not None: # (catalog property, catalog ID) reference claim - catalog_id_reference = pywikibot.Claim(REPO, catalog_pid, is_reference=True) + catalog_id_reference = pywikibot.Claim( + REPO, catalog_pid, is_reference=True + ) catalog_id_reference.setTarget(catalog_id) reference_node.append(catalog_id_reference) log_buffer.append(f'({catalog_pid}, {catalog_id})') @@ -821,9 +877,7 @@ def _reference( claim.addSources(reference_node, summary=edit_summary) LOGGER.info('Added %s reference node', log_msg) except (APIError, Error,) as error: - LOGGER.warning( - 'Could not add %s reference node: %s', log_msg, error - ) + LOGGER.warning('Could not add %s reference node: %s', log_msg, error) def _delete_or_deprecate(action, qid, tid, catalog, catalog_pid) -> None: diff --git a/soweego/linker/baseline.py b/soweego/linker/baseline.py index 3ae0ab44..c67bd25f 100644 --- a/soweego/linker/baseline.py +++ b/soweego/linker/baseline.py @@ -272,9 +272,7 @@ def _handle_result( to_upload.add(statement) if upload: - wikidata_bot.add_people_statements( - catalog, to_upload, 'links', sandbox - ) + wikidata_bot.add_people_statements(catalog, to_upload, 'links', sandbox) LOGGER.info('%s %s dumped to %s', catalog, origin, path_out) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index ff967438..7ba5167d 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -20,10 +20,16 @@ import click from sqlalchemy.exc import SQLAlchemyError -from soweego.commons import constants, data_gathering, keys, target_database, text_utils +from soweego.commons import ( + constants, + data_gathering, + keys, + target_database, + text_utils, +) from soweego.commons.db_manager import DBManager from soweego.ingester import wikidata_bot -from soweego.wikidata import vocabulary, api_requests +from 
soweego.wikidata import api_requests, vocabulary from soweego.wikidata.api_requests import get_url_blacklist LOGGER = logging.getLogger(__name__) @@ -31,7 +37,9 @@ # File name templates # For all CLIs WD_CACHE_FNAME = '{catalog}_{entity}_{criterion}_wd_cache.pkl' -IDS_TO_BE_DEPRECATED_FNAME = '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' +IDS_TO_BE_DEPRECATED_FNAME = ( + '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' +) SHARED_STATEMENTS_FNAME = '{catalog}_{entity}_{criterion}_shared_statements.csv' # For `dead_ids_cli` DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' @@ -39,7 +47,9 @@ EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv' URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv' # For `bio_cli` -BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' +BIO_STATEMENTS_TO_BE_ADDED_FNAME = ( + '{catalog}_{entity}_bio_statements_to_be_added.csv' +) @click.command() @@ -84,9 +94,10 @@ def dead_ids_cli(catalog, entity, deprecate, sandbox, dump_wikidata, dir_io): dir_io, DEAD_IDS_FNAME.format(catalog=catalog, entity=entity) ) wd_cache_path = os.path.join( - dir_io, WD_CACHE_FNAME.format( + dir_io, + WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion='dead_ids' - ) + ), ) # Handle Wikidata cache @@ -190,34 +201,31 @@ def links_cli( criterion = 'links' # Output paths deprecate_path = os.path.join( - dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + dir_io, + IDS_TO_BE_DEPRECATED_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) add_ext_ids_path = os.path.join( - dir_io, EXT_IDS_FNAME.format( - catalog=catalog, entity=entity, task='added' - ) + dir_io, + EXT_IDS_FNAME.format(catalog=catalog, entity=entity, task='added'), ) add_urls_path = os.path.join( - dir_io, URLS_FNAME.format( - catalog=catalog, entity=entity, task='added' - ) + dir_io, URLS_FNAME.format(catalog=catalog, entity=entity, task='added') ) ref_ext_ids_path = os.path.join( - dir_io, EXT_IDS_FNAME.format( - catalog=catalog, entity=entity, task='referenced' - ) + dir_io, + EXT_IDS_FNAME.format(catalog=catalog, entity=entity, task='referenced'), ) ref_urls_path = os.path.join( - dir_io, URLS_FNAME.format( - catalog=catalog, entity=entity, task='referenced' - ) + dir_io, + URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'), ) wd_cache_path = os.path.join( - dir_io, WD_CACHE_FNAME.format( + dir_io, + WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) # Handle Wikidata cache @@ -230,9 +238,14 @@ def links_cli( catalog, entity, blacklist, wd_cache=wd_cache ) else: - deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links( - catalog, entity, blacklist - ) + ( + deprecate, + add_ext_ids, + add_urls, + ref_ext_ids, + ref_urls, + wd_cache, + ) = links(catalog, entity, blacklist) # Nothing to do: the catalog doesn't contain links if deprecate is None: @@ -240,9 +253,13 @@ def links_cli( # Dump output files _dump_deprecated(deprecate, deprecate_path) - _dump_csv_output(add_ext_ids, add_ext_ids_path, 'third-party IDs to be added') + _dump_csv_output( + add_ext_ids, add_ext_ids_path, 'third-party IDs to be added' + ) _dump_csv_output(add_urls, add_urls_path, 'URLs to be added') - _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced') + _dump_csv_output( + ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced' + ) _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced') # Dump Wikidata cache 
@@ -263,7 +280,7 @@ def links_cli( if sandbox: LOGGER.info( 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + vocabulary.SANDBOX_2, ) LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( @@ -277,7 +294,9 @@ def links_cli( wikidata_bot.add_people_statements( catalog, add_urls, criterion, sandbox ) - LOGGER.info('Starting referencing of shared external IDs in Wikidata ...') + LOGGER.info( + 'Starting referencing of shared external IDs in Wikidata ...' + ) wikidata_bot.add_people_statements( catalog, add_ext_ids, criterion, sandbox ) @@ -333,24 +352,26 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): """ criterion = 'bio' deprecate_path = os.path.join( - dir_io, IDS_TO_BE_DEPRECATED_FNAME.format( + dir_io, + IDS_TO_BE_DEPRECATED_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) add_path = os.path.join( - dir_io, BIO_STATEMENTS_TO_BE_ADDED_FNAME.format( - catalog=catalog, entity=entity - ) + dir_io, + BIO_STATEMENTS_TO_BE_ADDED_FNAME.format(catalog=catalog, entity=entity), ) ref_path = os.path.join( - dir_io, SHARED_STATEMENTS_FNAME.format( + dir_io, + SHARED_STATEMENTS_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) wd_cache_path = os.path.join( - dir_io, WD_CACHE_FNAME.format( + dir_io, + WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion - ) + ), ) # Handle Wikidata cache @@ -380,7 +401,8 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # version should be the most efficient solution pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path + 'Biographical data gathered from Wikidata dumped to %s', + wd_cache_path, ) except MemoryError: LOGGER.warning('Could not pickle the Wikidata cache: memory error') @@ -391,16 +413,14 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): if sandbox: LOGGER.info( 'Running on the Wikidata sandbox item %s ...', - vocabulary.SANDBOX_2 + vocabulary.SANDBOX_2, ) LOGGER.info('Starting deprecation of %s IDs ...', catalog) wikidata_bot.delete_or_deprecate_identifiers( 'deprecate', catalog, entity, deprecate, sandbox ) LOGGER.info('Starting addition of extra statements to Wikidata ...') - wikidata_bot.add_people_statements( - catalog, add, criterion, sandbox - ) + wikidata_bot.add_people_statements(catalog, add, criterion, sandbox) LOGGER.info('Starting referencing of shared statements in Wikidata ...') wikidata_bot.add_people_statements( catalog, reference, criterion, sandbox @@ -488,7 +508,10 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Union[Tuple[defaultdict, list, list, list, list, dict], Tuple[None, None, None, None, None, None]]: +) -> Union[ + Tuple[defaultdict, list, list, list, list, dict], + Tuple[None, None, None, None, None, None], +]: """Validate identifiers against available links. Also generate statements based on additional links @@ -533,7 +556,11 @@ def links( if target_links is None: return None, None, None, None, None, None - deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add, reference = ( + defaultdict(set), + defaultdict(set), + defaultdict(set), + ) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -575,10 +602,13 @@ def links( 'URL statements to be added: %d', 'Third-party IDs to be referenced: %d. 
' 'URL statements to be referenced: %d', - catalog, entity, + catalog, + entity, len(deprecate), - len(add_ext_ids), len(add_urls), - len(ref_ext_ids), len(ref_urls) + len(add_ext_ids), + len(add_urls), + len(ref_ext_ids), + len(ref_urls), ) return deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_links @@ -586,7 +616,9 @@ def links( def bio( catalog: str, entity: str, wd_cache=None -) -> Union[Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None]]: +) -> Union[ + Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None] +]: """Validate identifiers against available biographical data. Look for: @@ -631,7 +663,11 @@ def bio( if target_bio is None: return None, None, None, None - deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add, reference = ( + defaultdict(set), + defaultdict(set), + defaultdict(set), + ) # Wikidata side if wd_cache is None: @@ -647,13 +683,14 @@ def bio( wd_bio = wd_cache # Validation - _validate( - keys.BIODATA, - wd_bio, target_bio, - deprecate, add, reference - ) + _validate(keys.BIODATA, wd_bio, target_bio, deprecate, add, reference) - return deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), wd_bio + return ( + deprecate, + _bio_statements_generator(add), + _bio_statements_generator(reference), + wd_bio, + ) def _apply_url_blacklist(url_statements): @@ -665,9 +702,7 @@ def _apply_url_blacklist(url_statements): # Expected order of magnitude: n = 10^2; m = 10^5 for domain in blacklist: # 10^2 url_statements = list( # Slurp the filter or it won't work - filter( - lambda stmt: domain not in stmt[2], url_statements # 10^5 - ) + filter(lambda stmt: domain not in stmt[2], url_statements) # 10^5 ) LOGGER.info( @@ -775,10 +810,13 @@ def _compute_shared_and_extra(criterion, wd_data, target_data): # No cast to `set` because `wd_data` triples hold sets themselves wd_other = list(filter(lambda x: len(x) == 3, wd_data)) # In `target_data` we look for relevant date PIDs - target_dates = set(filter( - lambda x: x[0] in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), - target_data - )) + target_dates = set( + filter( + lambda x: x[0] + in (vocabulary.DATE_OF_BIRTH, vocabulary.DATE_OF_DEATH), + target_data, + ) + ) target_other = target_data.difference(target_dates) shared_dates, extra_dates = _compare('dates', wd_dates, target_dates) shared_other, extra_other = _compare('other', wd_other, target_other) @@ -820,8 +858,14 @@ def _compare(what, wd, target): continue inputs = ( - shared, extra, wd_matches, target_matches, - i, wd_elem, j, t_elem + shared, + extra, + wd_matches, + target_matches, + i, + wd_elem, + j, + t_elem, ) if what == 'dates': _compare_dates(inputs) diff --git a/soweego/wikidata/api_requests.py b/soweego/wikidata/api_requests.py index fb98dc8f..8b1a5ad8 100644 --- a/soweego/wikidata/api_requests.py +++ b/soweego/wikidata/api_requests.py @@ -51,7 +51,7 @@ def resolve_qid(term: str, language='en') -> Optional[str]: 'action': 'wbsearchentities', 'format': 'json', 'search': term, - 'language': language + 'language': language, } response_body = _make_request(params) @@ -70,8 +70,7 @@ def resolve_qid(term: str, language='en') -> Optional[str]: # No search results except IndexError: LOGGER.info( - "No QIDs found for search term '%s' (language: %s)", - term, language + "No QIDs found for search term '%s' (language: %s)", term, language ) return None
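
The hunks above are mostly mechanical reformatting around the refactor that threads a parametrized heuristic QID and an optional (catalog PID, catalog ID) pair into every reference node built by `_reference`. As a rough orientation aid only, here is a minimal sketch of the resulting reference-node shape in pywikibot terms; it reuses the vocabulary constants and pywikibot calls visible in the diffs, while `build_reference_node`, the placeholder QID/PID/ID values, and the edit summary are hypothetical and not part of the patch series.

# Minimal sketch (not part of the patches): the reference node that the
# refactored `_reference` helper attaches to a claim, with the P887 heuristic
# parametrized (Q11660 for linker output, Q1266546 for validator output) and
# an optional (catalog PID, catalog ID) pair. Placeholder values throughout.
from datetime import date

import pywikibot

from soweego.wikidata import vocabulary

SITE = pywikibot.Site('wikidata', 'wikidata')
REPO = SITE.data_repository()
_today = date.today()
RETRIEVED_TODAY = pywikibot.WbTime(
    site=REPO, year=_today.year, month=_today.month, day=_today.day
)


def build_reference_node(heuristic, catalog_qid, catalog_pid=None, catalog_id=None):
    """Build [(P887, heuristic), (P248, catalog_qid),
    (catalog_pid, catalog_id) when both are given, (P813, today)]."""
    node = []

    # (based on heuristic, heuristic): depends on the bot task
    based_on_heuristic = pywikibot.Claim(
        REPO, vocabulary.BASED_ON_HEURISTIC, is_reference=True
    )
    based_on_heuristic.setTarget(pywikibot.ItemPage(REPO, heuristic))
    node.append(based_on_heuristic)

    # (stated in, catalog)
    stated_in = pywikibot.Claim(REPO, vocabulary.STATED_IN, is_reference=True)
    stated_in.setTarget(pywikibot.ItemPage(REPO, catalog_qid))
    node.append(stated_in)

    # (catalog ID property, catalog ID), only when both are known
    if catalog_pid is not None and catalog_id is not None:
        catalog_id_claim = pywikibot.Claim(REPO, catalog_pid, is_reference=True)
        catalog_id_claim.setTarget(catalog_id)
        node.append(catalog_id_claim)

    # (retrieved, today)
    retrieved = pywikibot.Claim(REPO, vocabulary.RETRIEVED, is_reference=True)
    retrieved.setTarget(RETRIEVED_TODAY)
    node.append(retrieved)

    return node


# Hypothetical usage on an existing `claim`, referencing a validator
# (record linkage) statement backed by a Discogs artist ID:
#
# claim.addSources(
#     build_reference_node(
#         vocabulary.RECORD_LINKAGE, 'Q504063', 'P1953', '264375'
#     ),
#     summary='illustrative edit summary only',
# )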