From 135e9211d98650421d120678df470e7fe7122a2d Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Fri, 13 Aug 2021 10:28:37 +0000
Subject: [PATCH 1/9] [WIP] start work on #418

---
 soweego/validator/checks.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index ff967438..0e544445 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -10,6 +10,7 @@ __copyright__ = 'Copyleft 2021, Hjfocs'
 
 import csv
+import gzip
 import json
 import logging
 import os
@@ -38,8 +39,10 @@
 # For `links_cli`
 EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv'
 URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv'
+WD_URLS_FNAME = 'wikidata_urls_for_{catalog}_{entity}.txt.gz'
 # For `bio_cli`
 BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv'
+WD_STATEMENTS_FNAME = 'wikidata_statements_for_{catalog}_{entity}.csv.gz'
 
 
 @click.command()
@@ -168,7 +171,7 @@ def links_cli(
 ):
     """Validate identifiers against links.
 
-    Dump 5 output files:
+    Dump 6 output files:
 
     1. catalog IDs to be deprecated. JSON format:
    {catalog_ID: [list of QIDs]}
@@ -183,6 +186,9 @@ def links_cli(
 
     5. URLs to be referenced. Same format as file #3
 
+    6. URLs found in Wikidata but not in the target catalog.
+    GZIP text format, one URL per line
+
     You can pass the '-u' flag to upload the output to Wikidata.
 
     The '-b' flag applies a URL blacklist of low-quality Web domains to file #3.
@@ -214,6 +220,11 @@ def links_cli(
             catalog=catalog, entity=entity, task='referenced'
         )
     )
+    wd_urls_path = os.path.join(
+        dir_io, WD_URLS_FNAME.format(
+            catalog=catalog, entity=entity
+        )
+    )
     wd_cache_path = os.path.join(
         dir_io, WD_CACHE_FNAME.format(
             catalog=catalog, entity=entity, criterion=criterion
@@ -230,6 +241,7 @@ def links_cli(
             catalog, entity, blacklist, wd_cache=wd_cache
         )
     else:
+        # FIXME add `wd_urls` arg
         deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links(
             catalog, entity, blacklist
         )
@@ -244,6 +256,8 @@ def links_cli(
     _dump_csv_output(add_urls, add_urls_path, 'URLs to be added')
     _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced')
     _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
+    with gzip.open(wd_urls_path, 'wt') as gzout:
+        gzout.writelines([url + '\n' for url in wd_urls])
 
     # Dump Wikidata cache
     if dump_wikidata:

From ef5def00625be380402296c5a0a32be563fcaddd Mon Sep 17 00:00:00 2001
From: Marco Fossati
Date: Mon, 16 Aug 2021 15:11:57 +0200
Subject: [PATCH 2/9] dump Wikidata values not in the target catalog, closes
 #418

---
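Note for reviewers: for the `links` criterion, the `wd_only` bookkeeping
introduced below reduces to plain set algebra between the two sources.
A toy sketch with made-up URLs (illustration only, not project code):

    wd_data = {'https://example.org/a', 'https://example.org/b'}
    target_data = {'https://example.org/b', 'https://example.org/c'}

    shared = wd_data & target_data   # {'https://example.org/b'}: to be referenced
    extra = target_data - wd_data    # {'https://example.org/c'}: to be added
    wd_only = wd_data - target_data  # {'https://example.org/a'}: dumped to file #6

The `bio` criterion goes through `_compare()` instead, since its values mix
(PID, date) couples with triples whose third element is itself a set.
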
 soweego/validator/checks.py | 191 +++++++++++++++++++++++------------
 1 file changed, 121 insertions(+), 70 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 0e544445..4425079a 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -10,7 +10,6 @@ __copyright__ = 'Copyleft 2021, Hjfocs'
 
 import csv
-import gzip
 import json
 import logging
 import os
@@ -173,21 +172,23 @@ def links_cli(
 
     Dump 6 output files:
 
-    1. catalog IDs to be deprecated. JSON format:
-    {catalog_ID: [list of QIDs]}
+    1. catalog IDs to be deprecated.
+    JSON format: {catalog_ID: [list of QIDs]}
 
-    2. third-party IDs to be added. CSV format:
-    QID,third-party_PID,third-party_ID,catalog_ID
+    2. third-party IDs to be added.
+    CSV format: QID,third-party_PID,third-party_ID,catalog_ID
 
-    3. URLs to be added. CSV format:
-    QID,P2888,URL,catalog_ID
+    3. URLs to be added.
+    CSV format: QID,P2888,URL,catalog_ID
 
-    4. third-party IDs to be referenced. Same format as file #2
+    4. third-party IDs to be referenced.
+    Same format as file #2
 
-    5. URLs to be referenced. Same format as file #3
+    5. URLs to be referenced.
+    Same format as file #3
 
     6. URLs found in Wikidata but not in the target catalog.
-    GZIP text format, one URL per line
+    CSV format: URL,QID
 
     You can pass the '-u' flag to upload the output to Wikidata.
@@ -237,12 +238,11 @@ def links_cli(
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
         # Discard the last return value: Wikidata cache
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, _ = links(
+        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, _ = links(
             catalog, entity, blacklist, wd_cache=wd_cache
         )
     else:
-        # FIXME add `wd_urls` arg
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_cache = links(
+        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links(
             catalog, entity, blacklist
         )
@@ -256,8 +256,7 @@ def links_cli(
     _dump_csv_output(add_urls, add_urls_path, 'URLs to be added')
     _dump_csv_output(ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced')
     _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
-    with gzip.open(wd_urls_path, 'wt') as gzout:
-        gzout.writelines([url + '\n' for url in wd_urls])
+    _dump_csv_output(wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}')
 
     # Dump Wikidata cache
     if dump_wikidata:
@@ -333,15 +332,19 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
 
     Look for birth/death dates, birth/death places, gender.
 
-    Dump 3 output files:
+    Dump 4 output files:
 
-    1. catalog IDs to be deprecated. JSON format:
-    {catalog_ID: [list of QIDs]}
+    1. catalog IDs to be deprecated.
+    JSON format: {catalog_ID: [list of QIDs]}
 
-    2. statements to be added. CSV format:
-    QID,PID,value,catalog_ID
+    2. statements to be added.
+    CSV format: QID,PID,value,catalog_ID
 
-    3. shared statements to be referenced. Same format as file #2
+    3. shared statements to be referenced.
+    Same format as file #2
+
+    4. statements found in Wikidata but not in the target catalog.
+    CSV format: catalog_ID,PID,value,QID
 
     You can pass the '-u' flag to upload the output to Wikidata.
""" @@ -361,6 +364,11 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): catalog=catalog, entity=entity, criterion=criterion ) ) + wd_stmts_path = os.path.join( + dir_io, WD_STATEMENTS_FNAME.format( + catalog=catalog, entity=entity + ) + ) wd_cache_path = os.path.join( dir_io, WD_CACHE_FNAME.format( catalog=catalog, entity=entity, criterion=criterion @@ -373,9 +381,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - deprecate, add, reference, _ = bio(catalog, entity, wd_cache=wd_cache) + deprecate, add, reference, wd_stmts, _ = bio(catalog, entity, wd_cache=wd_cache) else: - deprecate, add, reference, wd_cache = bio(catalog, entity) + deprecate, add, reference, wd_stmts, wd_cache = bio(catalog, entity) # Nothing to do: the catalog doesn't contain biographical data if deprecate is None: @@ -385,6 +393,10 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): _dump_deprecated(deprecate, deprecate_path) _dump_csv_output(add, add_path, 'statements to be added') _dump_csv_output(reference, ref_path, 'shared statements to be referenced') + _dump_csv_output( + wd_stmts, wd_stmts_path, + f'statements in Wikidata but not in {catalog} {entity}' + ) # Dump Wikidata cache if dump_wikidata: @@ -502,7 +514,7 @@ def dead_ids( def links( catalog: str, entity: str, url_blacklist=False, wd_cache=None -) -> Union[Tuple[defaultdict, list, list, list, list, dict], Tuple[None, None, None, None, None, None]]: +) -> Union[Tuple[defaultdict, list, list, list, list, list, dict], Tuple[None, None, None, None, None, None, None]]: """Validate identifiers against available links. Also generate statements based on additional links @@ -531,23 +543,25 @@ def links( of URL domains. Default: ``False`` :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata in a previous run. Default: ``None`` - :return: ``tuple`` of 6 objects + :return: 7 objects 1. ``dict`` of identifiers that should be deprecated 2. ``list`` of third-party identifiers that should be added 3. ``list`` of URLs that should be added 4. ``list`` of third-party identifiers that should be referenced 5. ``list`` of URLs that should be referenced - 6. ``dict`` of links gathered from Wikidata + 6. ``list`` of URLs found in Wikidata but not in the target catalog + 7. ``dict`` of links gathered from Wikidata """ # Target catalog side first: # enable early return in case of no target links target_links = data_gathering.gather_target_links(entity, catalog) if target_links is None: - return None, None, None, None, None, None + return None, None, None, None, None, None, None - deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set) + deprecate, add = defaultdict(set), defaultdict(set) + reference, wd_only = defaultdict(set), defaultdict(set) # Wikidata side url_pids, ext_id_pids_to_urls = data_gathering.gather_relevant_pids() @@ -566,9 +580,12 @@ def links( wd_links = wd_cache # Validation - _validate(keys.LINKS, wd_links, target_links, deprecate, add, reference) + _validate( + keys.LINKS, wd_links, target_links, + deprecate, add, reference, wd_only + ) - # Links to be added: + # URLs to be added: # 1. 
     # 1. Separate external IDs from URLs
     add_ext_ids, add_urls = data_gathering.extract_ids_from_urls(
         add, ext_id_pids_to_urls
@@ -577,30 +594,46 @@ def links(
     if url_blacklist:
         add_urls = _apply_url_blacklist(add_urls)
 
-    # Links to be referenced: separate external IDs from URLs
+    # URLs to be referenced: separate external IDs from URLs
     ref_ext_ids, ref_urls = data_gathering.extract_ids_from_urls(
         reference, ext_id_pids_to_urls
     )
 
+    # Wikidata-only URLs: convert into a list of statements
+    wd_only_urls = []
+    for (qid, tid), urls in wd_only.items():
+        for url in urls:
+            wd_only_urls.append((tid, url, qid))
+
     LOGGER.info(
         'Validation completed. Target: %s %s. '
         'IDs to be deprecated: %d. '
         'Third-party IDs to be added: %d. '
-        'URL statements to be added: %d',
+        'URL statements to be added: %d. '
         'Third-party IDs to be referenced: %d. '
-        'URL statements to be referenced: %d',
+        'URL statements to be referenced: %d. '
+        'URLs in Wikidata but not in the target: %d',
         catalog, entity,
         len(deprecate),
-        len(add_ext_ids), len(add_urls),
-        len(ref_ext_ids), len(ref_urls)
+        len(add_ext_ids),
+        len(add_urls),
+        len(ref_ext_ids),
+        len(ref_urls),
+        len(wd_only_urls)
     )
 
-    return deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_links
+    return (
+        deprecate,
+        add_ext_ids, add_urls,
+        ref_ext_ids, ref_urls,
+        wd_only_urls,
+        wd_links
+    )
 
 
 def bio(
     catalog: str, entity: str, wd_cache=None
-) -> Union[Tuple[defaultdict, Iterator, Iterator, dict], Tuple[None, None, None, None]]:
+) -> Union[Tuple[defaultdict, Iterator, Iterator, Iterator, dict], Tuple[None, None, None, None, None]]:
     """Validate identifiers against available biographical data.
 
     Look for:
@@ -631,21 +664,23 @@ def bio(
        A supported entity
     :param wd_cache: (optional) a ``dict`` of links gathered from Wikidata
       in a previous run
-    :return: a ``tuple`` of 4 objects
+    :return: 5 objects
 
         1. ``dict`` of identifiers that should be deprecated
         2. ``generator`` of statements that should be added
         3. ``generator`` of shared statements that should be referenced
-        4. ``dict`` of biographical data gathered from Wikidata
+        4. ``generator`` of statements found in Wikidata but not in the target catalog
+        5. ``dict`` of biographical data gathered from Wikidata
 
     """
     # Target catalog side first:
     # enable early return in case of no target data
     target_bio = data_gathering.gather_target_biodata(entity, catalog)
     if target_bio is None:
-        return None, None, None, None
+        return None, None, None, None, None
 
-    deprecate, add, reference = defaultdict(set), defaultdict(set), defaultdict(set)
+    deprecate, add = defaultdict(set), defaultdict(set)
+    reference, wd_only = defaultdict(set), defaultdict(set)
 
     # Wikidata side
     if wd_cache is None:
@@ -661,14 +696,16 @@ def bio(
         wd_bio = wd_cache
 
     # Validation
-    _validate(
-        keys.BIODATA,
-        wd_bio, target_bio,
-        deprecate, add, reference
-    )
+    _validate(keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only)
+
+    return (
+        deprecate,
+        _bio_statements_generator(add),
+        _bio_statements_generator(reference),
+        _bio_statements_generator(wd_only, qid_first=False),
+        wd_bio
+    )
 
-    return deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), wd_bio
-
 
 def _apply_url_blacklist(url_statements):
     LOGGER.info('Applying URL blacklist ...')
@@ -691,13 +728,16 @@ def _apply_url_blacklist(url_statements):
     return url_statements
 
 
-def _bio_statements_generator(stmts_dict):
+def _bio_statements_generator(stmts_dict, qid_first=True):
     for (qid, tid), values in stmts_dict.items():
         for pid, value in values:
-            yield qid, pid, value, tid
+            if qid_first:
+                yield qid, pid, value, tid
+            else:
+                yield tid, pid, value, qid
 
 
-def _validate(criterion, wd, target_generator, deprecate, add, reference):
+def _validate(criterion, wd, target_generator, deprecate, add, reference, wd_only):
     LOGGER.info('Starting check against target %s ...', criterion)
     target = _consume_target_generator(target_generator)
@@ -732,11 +772,11 @@ def _validate(
             )
             continue
 
-        shared_data, extra_data = _compute_shared_and_extra(
+        shared_set, extra_set, wd_only_set = _compute_comparison_sets(
             criterion, wd_data, target_data
         )
 
-        if not shared_data:
+        if not shared_set:
            LOGGER.debug(
                'No shared %s between %s and %s. The identifier '
                'statement should be deprecated',
@@ -751,37 +791,42 @@ def _validate(
                 qid,
                 tid,
                 criterion,
-                shared_data,
+                shared_set,
             )
-            reference[(qid, tid)].update(shared_data)
+            reference[(qid, tid)].update(shared_set)
 
-            if extra_data:
+            if extra_set:
                 LOGGER.debug(
                     '%s has extra %s that should be added to %s: %s',
-                    tid,
-                    criterion,
-                    qid,
-                    extra_data,
+                    tid, criterion, qid, extra_set
                 )
-                add[(qid, tid)].update(extra_data)
+                add[(qid, tid)].update(extra_set)
             else:
                 LOGGER.debug('%s has no extra %s', tid, criterion)
 
+            if wd_only_set:
+                LOGGER.debug('%s has %s not in %s: %s', qid, criterion, tid, wd_only_set)
+                wd_only[(qid, tid)].update(wd_only_set)
+            else:
+                LOGGER.debug('%s has no extra %s', qid, criterion)
+
     LOGGER.info(
         'Check against target %s completed: %d IDs to be deprecated, '
-        '%d Wikidata items with statements to be added, ',
-        '%d Wikidata items with shared statements to be referenced',
+        '%d Wikidata items with statements to be added, '
+        '%d Wikidata items with shared statements to be referenced, '
+        '%d values in Wikidata but not in the target catalog',
         criterion, len(deprecate),
         len(add),
-        len(reference)
+        len(reference),
+        len(wd_only)
     )
 
 
@@ -795,14 +840,16 @@ def _validate(
-def _compute_shared_and_extra(criterion, wd_data, target_data):
+def _compute_comparison_sets(criterion, wd_data, target_data):
     if criterion == keys.LINKS:
         shared = wd_data.intersection(target_data)
         extra = target_data.difference(wd_data)
+        wd_only = wd_data.difference(target_data)
     # Biographical validation requires more complex comparisons
     elif criterion == keys.BIODATA:
         # `wd_data` has either couples or triples: couples are dates
         wd_dates = set(filter(lambda x: len(x) == 2, wd_data))
         # No cast to `set` because `wd_data` triples hold sets themselves
         wd_other = list(filter(lambda x: len(x) == 3, wd_data))
         # In `target_data` we look for relevant date PIDs
         target_dates = set(filter(
@@ -812,10 +859,16 @@ def _compute_comparison_sets(criterion, wd_data, target_data):
         ))
         target_other = target_data.difference(target_dates)
         shared_dates, extra_dates = _compare('dates', wd_dates, target_dates)
+        wd_only_dates = wd_dates.difference(shared_dates)
         shared_other, extra_other = _compare('other', wd_other, target_other)
+        # `wd_other` has triples: build a set with couples
+        # to directly compute the difference with `shared_other`
+        wd_other_set = {(pid, qid) for pid, qid, _ in wd_other}
+        wd_only_other = wd_other_set.difference(shared_other)
         shared = shared_dates | shared_other
         extra = extra_dates | extra_other
+        wd_only = wd_only_dates | wd_only_other
     else:
         raise ValueError(
             f"Invalid validation criterion: '{criterion}'. "
             f"Please use either '{keys.LINKS}' or '{keys.BIODATA}'"
         )
-    return shared, extra
+    return shared, extra, wd_only
 
 
 def _compare(what, wd, target):
@@ -935,12 +986,12 @@ def _dump_deprecated(data, outpath):
         LOGGER.info("No IDs to be deprecated, won't dump to file")
 
 
-def _dump_csv_output(data, outpath, log_msg_subject):
+def _dump_csv_output(data, out_path, log_msg_subject):
     if data:
-        with open(outpath, 'w') as ids_out:
-            writer = csv.writer(ids_out)
+        with open(out_path, 'w') as fout:
+            writer = csv.writer(fout)
             writer.writerows(data)
-        LOGGER.info('%s dumped to %s', log_msg_subject, outpath)
+        LOGGER.info('%s dumped to %s', log_msg_subject, out_path)
     else:
         LOGGER.info("No %s, won't dump to file", log_msg_subject)

From 0dc09dea6321ad718a530413f1d1c23cfdc4234d Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Tue, 17 Aug 2021 10:22:51 +0000
Subject: [PATCH 3/9] refactor WD-only statements file name

---
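Note for reviewers: both CLIs now derive the Wikidata-only dump name from a
single template. A quick sketch of the resulting file names, assuming
`links_cli` sets `criterion` to 'links' as `bio_cli` sets it to 'bio'
('discogs' and 'musician' are placeholder values):

    WD_STATEMENTS_FNAME = 'wikidata_{criterion}_for_{catalog}_{entity}.csv'

    WD_STATEMENTS_FNAME.format(criterion='links', catalog='discogs', entity='musician')
    # -> 'wikidata_links_for_discogs_musician.csv'
    WD_STATEMENTS_FNAME.format(criterion='bio', catalog='discogs', entity='musician')
    # -> 'wikidata_bio_for_discogs_musician.csv'
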
" f"Please use either '{keys.LINKS}' or '{keys.BIODATA}'" ) - return shared, extra + return shared, extra, wd_only def _compare(what, wd, target): @@ -935,12 +986,12 @@ def _dump_deprecated(data, outpath): LOGGER.info("No IDs to be deprecated, won't dump to file") -def _dump_csv_output(data, outpath, log_msg_subject): +def _dump_csv_output(data, out_path, log_msg_subject): if data: - with open(outpath, 'w') as ids_out: - writer = csv.writer(ids_out) + with open(out_path, 'w') as fout: + writer = csv.writer(fout) writer.writerows(data) - LOGGER.info('%s dumped to %s', log_msg_subject, outpath) + LOGGER.info('%s dumped to %s', log_msg_subject, out_path) else: LOGGER.info("No %s, won't dump to file", log_msg_subject) From 0dc09dea6321ad718a530413f1d1c23cfdc4234d Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 17 Aug 2021 10:22:51 +0000 Subject: [PATCH 3/9] refactor WD-only statements file name --- soweego/validator/checks.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 4425079a..1f5c00c9 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -33,15 +33,14 @@ WD_CACHE_FNAME = '{catalog}_{entity}_{criterion}_wd_cache.pkl' IDS_TO_BE_DEPRECATED_FNAME = '{catalog}_{entity}_{criterion}_ids_to_be_deprecated.json' SHARED_STATEMENTS_FNAME = '{catalog}_{entity}_{criterion}_shared_statements.csv' +WD_STATEMENTS_FNAME = 'wikidata_{criterion}_for_{catalog}_{entity}.csv' # For `dead_ids_cli` DEAD_IDS_FNAME = '{catalog}_{entity}_dead_ids.json' # For `links_cli` EXT_IDS_FNAME = '{catalog}_{entity}_external_ids_to_be_{task}.csv' URLS_FNAME = '{catalog}_{entity}_urls_to_be_{task}.csv' -WD_URLS_FNAME = 'wikidata_urls_for_{catalog}_{entity}.txt.gz' # For `bio_cli` BIO_STATEMENTS_TO_BE_ADDED_FNAME = '{catalog}_{entity}_bio_statements_to_be_added.csv' -WD_STATEMENTS_FNAME = 'wikidata_statements_for_{catalog}_{entity}.csv.gz' @click.command() @@ -222,8 +221,8 @@ def links_cli( ) ) wd_urls_path = os.path.join( - dir_io, WD_URLS_FNAME.format( - catalog=catalog, entity=entity + dir_io, WD_STATEMENTS_FNAME.format( + criterion=criterion, catalog=catalog, entity=entity ) ) wd_cache_path = os.path.join( @@ -366,7 +365,7 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): ) wd_stmts_path = os.path.join( dir_io, WD_STATEMENTS_FNAME.format( - catalog=catalog, entity=entity + criterion=criterion, catalog=catalog, entity=entity ) ) wd_cache_path = os.path.join( @@ -406,7 +405,7 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io): # version should be the most efficient solution pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL) LOGGER.info( - 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path + 'Biographical data gathered from Wikidata dumped to %s', wd_cache_path ) except MemoryError: LOGGER.warning('Could not pickle the Wikidata cache: memory error') From 9f30cd9213d337e136ae5191b0030748e4dcc88d Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Tue, 17 Aug 2021 16:48:12 +0000 Subject: [PATCH 4/9] avoid useless computation: pre-filter missing non-date target values --- soweego/validator/checks.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 1f5c00c9..68f536fe 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -830,7 +830,7 @@ def _compute_comparison_sets(criterion, wd_data, target_data): elif 
 soweego/validator/checks.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 1f5c00c9..68f536fe 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -830,7 +830,7 @@ def _compute_comparison_sets(criterion, wd_data, target_data):
     elif criterion == keys.BIODATA:
         # `wd_data` has either couples or triples: couples are dates
         wd_dates = set(filter(lambda x: len(x) == 2, wd_data))
-        # No cast to `set` because `wd_data` triples hold sets themselves
+        # Don't cast to set: `wd_data` triples hold sets themselves
         wd_other = list(filter(lambda x: len(x) == 3, wd_data))
         # In `target_data` we look for relevant date PIDs
         target_dates = set(filter(
@@ -865,6 +865,10 @@ def _compare(what, wd, target):
     # the same property
     wd_matches, target_matches = [], []
 
+    # Filter missing target values (doesn't apply to dates)
+    if what == 'other':
+        target = list(filter(lambda x: x[1] is not None, target))
+
     for i, wd_elem in enumerate(wd):
         for j, t_elem in enumerate(target):
             # Don't compare when already matched
@@ -875,19 +879,18 @@ def _compare(what, wd, target):
             if wd_elem[0] != t_elem[0]:
                 continue
 
-            # Skip unexpected `None` values
-            if None in (wd_elem[1], t_elem[1]):
-                LOGGER.warning(
-                    'Skipping unexpected %s pair with missing value(s)',
-                    (wd_elem, t_elem),
-                )
-                continue
-
             inputs = (
                 shared, extra, wd_matches, target_matches, i, wd_elem, j, t_elem
             )
             if what == 'dates':
+                # Missing dates are unexpected: skip but warn
+                if None in (wd_elem[1], t_elem[1]):
+                    LOGGER.warning(
+                        'Skipping unexpected %s date pair with missing value(s)',
+                        (wd_elem, t_elem),
+                    )
+                    continue
                 _compare_dates(inputs)
             elif what == 'other':
                 _compare_other(inputs)

From 6e39c34d0cd14b73ef2c85743739883537ba6803 Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Wed, 18 Aug 2021 10:54:24 +0000
Subject: [PATCH 5/9] dump full Wikidata URLs for catalog providers

---
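Note for reviewers: catalog providers get rows that spell out full
wikidata.org URLs instead of bare QIDs and PIDs. A sketch of the prefixing
with placeholder identifiers:

    QID_PREFIX = 'https://www.wikidata.org/wiki/'
    PID_PREFIX = QID_PREFIX + 'Property:'

    tid, pid, value, qid = '123', 'P19', 'Q90', 'Q42'
    row = (tid, PID_PREFIX + pid, QID_PREFIX + value, QID_PREFIX + qid)
    # -> ('123',
    #     'https://www.wikidata.org/wiki/Property:P19',
    #     'https://www.wikidata.org/wiki/Q90',
    #     'https://www.wikidata.org/wiki/Q42')

In the real generator the value is only prefixed when it matches
`constants.QID_REGEX`, since biographical values can also be plain literals
such as dates.
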
""" @@ -599,10 +605,11 @@ def links( ) # Wikidata-only URLs: convert into a list of statements + # with complete Wikidata item URLs wd_only_urls = [] for (qid, tid), urls in wd_only.items(): for url in urls: - wd_only_urls.append((tid, url, qid)) + wd_only_urls.append((tid, url, QID_PREFIX + qid)) LOGGER.info( 'Validation completed. Target: %s %s. ' @@ -701,7 +708,7 @@ def bio( deprecate, _bio_statements_generator(add), _bio_statements_generator(reference), - _bio_statements_generator(wd_only, qid_first=False), + _bio_statements_generator(wd_only, for_catalogs=True), wd_bio ) @@ -727,13 +734,15 @@ def _apply_url_blacklist(url_statements): return url_statements -def _bio_statements_generator(stmts_dict, qid_first=True): +def _bio_statements_generator(stmts_dict, for_catalogs=False): for (qid, tid), values in stmts_dict.items(): for pid, value in values: - if qid_first: + if not for_catalogs: yield qid, pid, value, tid else: - yield tid, pid, value, qid + if match(constants.QID_REGEX, value): + value = QID_PREFIX + value + yield tid, PID_PREFIX + pid, value, QID_PREFIX + qid def _validate(criterion, wd, target_generator, deprecate, add, reference, wd_only): From 2a8811a528b5299edb484b1346963f95d227d8c3 Mon Sep 17 00:00:00 2001 From: Marco at VPS Date: Fri, 20 Aug 2021 14:48:32 +0000 Subject: [PATCH 6/9] fix bad merge conflict resolution --- soweego/validator/checks.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 4950c7fb..32916dec 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -255,14 +255,9 @@ def links_cli( catalog, entity, blacklist, wd_cache=wd_cache ) else: - ( - deprecate, - add_ext_ids, - add_urls, - ref_ext_ids, - ref_urls, - wd_cache, - ) = links(catalog, entity, blacklist) + deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links( + catalog, entity, blacklist + ) # Nothing to do: the catalog doesn't contain links if deprecate is None: From 6a02411712c388fcc63863e1c13f706bcb565791 Mon Sep 17 00:00:00 2001 From: travis Date: Mon, 23 Aug 2021 11:01:59 +0000 Subject: [PATCH 7/9] format code & organize imports --- soweego/validator/checks.py | 93 +++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py index 32916dec..d6950cf3 100644 --- a/soweego/validator/checks.py +++ b/soweego/validator/checks.py @@ -234,9 +234,10 @@ def links_cli( URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'), ) wd_urls_path = os.path.join( - dir_io, WD_STATEMENTS_FNAME.format( + dir_io, + WD_STATEMENTS_FNAME.format( criterion=criterion, catalog=catalog, entity=entity - ) + ), ) wd_cache_path = os.path.join( dir_io, @@ -251,13 +252,25 @@ def links_cli( wd_cache = pickle.load(cin) LOGGER.info("Loaded Wikidata cache from '%s'", cin.name) # Discard the last return value: Wikidata cache - deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, _ = links( - catalog, entity, blacklist, wd_cache=wd_cache - ) + ( + deprecate, + add_ext_ids, + add_urls, + ref_ext_ids, + ref_urls, + wd_urls, + _, + ) = links(catalog, entity, blacklist, wd_cache=wd_cache) else: - deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links( - catalog, entity, blacklist - ) + ( + deprecate, + add_ext_ids, + add_urls, + ref_ext_ids, + ref_urls, + wd_urls, + wd_cache, + ) = links(catalog, entity, blacklist) # Nothing to do: the catalog 
 soweego/validator/checks.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 4950c7fb..32916dec 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -255,14 +255,9 @@ def links_cli(
             catalog, entity, blacklist, wd_cache=wd_cache
         )
     else:
-        (
-            deprecate,
-            add_ext_ids,
-            add_urls,
-            ref_ext_ids,
-            ref_urls,
-            wd_cache,
-        ) = links(catalog, entity, blacklist)
+        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links(
+            catalog, entity, blacklist
+        )
 
     # Nothing to do: the catalog doesn't contain links
     if deprecate is None:

From 6a02411712c388fcc63863e1c13f706bcb565791 Mon Sep 17 00:00:00 2001
From: travis
Date: Mon, 23 Aug 2021 11:01:59 +0000
Subject: [PATCH 7/9] format code & organize imports

---
 soweego/validator/checks.py | 93 +++++++++++++++++++++++------------
 1 file changed, 64 insertions(+), 29 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index 32916dec..d6950cf3 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -234,9 +234,10 @@ def links_cli(
         URLS_FNAME.format(catalog=catalog, entity=entity, task='referenced'),
     )
     wd_urls_path = os.path.join(
-        dir_io, WD_STATEMENTS_FNAME.format(
+        dir_io,
+        WD_STATEMENTS_FNAME.format(
             criterion=criterion, catalog=catalog, entity=entity
-        )
+        ),
     )
     wd_cache_path = os.path.join(
         dir_io,
@@ -251,13 +252,25 @@ def links_cli(
         with open(wd_cache_path, 'rb') as cin:
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
         # Discard the last return value: Wikidata cache
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, _ = links(
-            catalog, entity, blacklist, wd_cache=wd_cache
-        )
+        (
+            deprecate,
+            add_ext_ids,
+            add_urls,
+            ref_ext_ids,
+            ref_urls,
+            wd_urls,
+            _,
+        ) = links(catalog, entity, blacklist, wd_cache=wd_cache)
     else:
-        deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = links(
-            catalog, entity, blacklist
-        )
+        (
+            deprecate,
+            add_ext_ids,
+            add_urls,
+            ref_ext_ids,
+            ref_urls,
+            wd_urls,
+            wd_cache,
+        ) = links(catalog, entity, blacklist)
 
     # Nothing to do: the catalog doesn't contain links
     if deprecate is None:
@@ -273,7 +286,9 @@ def links_cli(
     _dump_csv_output(
         ref_ext_ids, ref_ext_ids_path, 'shared third-party IDs to be referenced'
     )
     _dump_csv_output(ref_urls, ref_urls_path, 'shared URLs to be referenced')
-    _dump_csv_output(wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}')
+    _dump_csv_output(
+        wd_urls, wd_urls_path, f'Wikidata URLs not in {catalog} {entity}'
+    )
 
     # Dump Wikidata cache
     if dump_wikidata:
@@ -385,9 +400,10 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
         ),
     )
     wd_stmts_path = os.path.join(
-        dir_io, WD_STATEMENTS_FNAME.format(
+        dir_io,
+        WD_STATEMENTS_FNAME.format(
             criterion=criterion, catalog=catalog, entity=entity
-        )
+        ),
     )
     wd_cache_path = os.path.join(
         dir_io,
@@ -402,7 +418,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
         # Discard the last return value: Wikidata cache
-        deprecate, add, reference, wd_stmts, _ = bio(catalog, entity, wd_cache=wd_cache)
+        deprecate, add, reference, wd_stmts, _ = bio(
+            catalog, entity, wd_cache=wd_cache
+        )
     else:
         deprecate, add, reference, wd_stmts, wd_cache = bio(catalog, entity)
 
@@ -415,8 +433,9 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
     _dump_csv_output(add, add_path, 'statements to be added')
     _dump_csv_output(reference, ref_path, 'shared statements to be referenced')
     _dump_csv_output(
-        wd_stmts, wd_stmts_path,
-        f'statements in Wikidata but not in {catalog} {entity}'
+        wd_stmts,
+        wd_stmts_path,
+        f'statements in Wikidata but not in {catalog} {entity}',
     )
 
     # Dump Wikidata cache
@@ -428,7 +447,8 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
             pickle.dump(wd_cache, cout, protocol=pickle.HIGHEST_PROTOCOL)
             LOGGER.info(
-                'Biographical data gathered from Wikidata dumped to %s', wd_cache_path
+                'Biographical data gathered from Wikidata dumped to %s',
+                wd_cache_path,
             )
         except MemoryError:
             LOGGER.warning('Could not pickle the Wikidata cache: memory error')
@@ -601,8 +620,7 @@ def links(
 
     # Validation
     _validate(
-        keys.LINKS, wd_links, target_links,
-        deprecate, add, reference, wd_only
+        keys.LINKS, wd_links, target_links, deprecate, add, reference, wd_only
     )
 
     # URLs to be added:
@@ -634,21 +652,24 @@ def links(
         'Third-party IDs to be referenced: %d. '
        'URL statements to be referenced: %d. '
        'URLs in Wikidata but not in the target: %d',
-        catalog, entity,
+        catalog,
+        entity,
         len(deprecate),
         len(add_ext_ids),
         len(add_urls),
         len(ref_ext_ids),
         len(ref_urls),
-        len(wd_only_urls)
+        len(wd_only_urls),
     )
 
     return (
         deprecate,
-        add_ext_ids, add_urls,
-        ref_ext_ids, ref_urls,
+        add_ext_ids,
+        add_urls,
+        ref_ext_ids,
+        ref_urls,
         wd_only_urls,
-        wd_links
+        wd_links,
     )
 
 
@@ -717,14 +738,16 @@ def bio(
         wd_bio = wd_cache
 
     # Validation
-    _validate(keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only)
+    _validate(
+        keys.BIODATA, wd_bio, target_bio, deprecate, add, reference, wd_only
+    )
 
     return (
         deprecate,
         _bio_statements_generator(add),
         _bio_statements_generator(reference),
         _bio_statements_generator(wd_only, for_catalogs=True),
-        wd_bio
+        wd_bio,
     )
 
 
@@ -758,7 +781,9 @@ def _bio_statements_generator(stmts_dict, for_catalogs=False):
             yield tid, PID_PREFIX + pid, value, QID_PREFIX + qid
 
 
-def _validate(criterion, wd, target_generator, deprecate, add, reference, wd_only):
+def _validate(
+    criterion, wd, target_generator, deprecate, add, reference, wd_only
+):
     LOGGER.info('Starting check against target %s ...', criterion)
     target = _consume_target_generator(target_generator)
 
@@ -819,14 +844,23 @@ def _validate(
             if extra_set:
                 LOGGER.debug(
                     '%s has extra %s that should be added to %s: %s',
-                    tid, criterion, qid, extra_set
+                    tid,
+                    criterion,
+                    qid,
+                    extra_set,
                 )
                 add[(qid, tid)].update(extra_set)
             else:
                 LOGGER.debug('%s has no extra %s', tid, criterion)
 
             if wd_only_set:
-                LOGGER.debug('%s has %s not in %s: %s', qid, criterion, tid, wd_only_set)
+                LOGGER.debug(
+                    '%s has %s not in %s: %s',
+                    qid,
+                    criterion,
+                    tid,
+                    wd_only_set,
+                )
                 wd_only[(qid, tid)].update(wd_only_set)
             else:
                 LOGGER.debug('%s has no extra %s', qid, criterion)
@@ -836,10 +870,11 @@ def _validate(
         '%d Wikidata items with statements to be added, '
         '%d Wikidata items with shared statements to be referenced, '
         '%d values in Wikidata but not in the target catalog',
-        criterion, len(deprecate),
+        criterion,
+        len(deprecate),
         len(add),
         len(reference),
-        len(wd_only)
+        len(wd_only),
     )

From 6aca8b9457acac3703b10dfc11e8d1f992359db6 Mon Sep 17 00:00:00 2001
From: Marco at VPS
Date: Thu, 2 Sep 2021 16:31:48 +0000
Subject: [PATCH 8/9] return a single None to simplify code and comply with
 type hints

Tackles #428#discussion_r699422874
---
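Note for reviewers: collapsing the all-None tuple into a single `None` also
lets the hints read as `Optional[Tuple[...]]` rather than a `Union` of two
tuple shapes. A minimal sketch of the resulting calling convention (toy
function, not the soweego API):

    from typing import Optional, Tuple

    def gather(x: int) -> Optional[Tuple[int, int]]:
        if x < 0:  # nothing to do: early return
            return None
        return x, x * 2

    result = gather(3)
    if result is None:
        raise SystemExit('nothing to validate')
    first, second = result
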
 soweego/validator/checks.py | 61 ++++++++++++++---------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index d6950cf3..aa7fbdca 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -246,36 +246,22 @@ def links_cli(
         ),
     )
 
-    # Handle Wikidata cache
+    # Wikidata cache
+    wd_cache = None
     if os.path.isfile(wd_cache_path):
         with open(wd_cache_path, 'rb') as cin:
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
-        # Discard the last return value: Wikidata cache
-        (
-            deprecate,
-            add_ext_ids,
-            add_urls,
-            ref_ext_ids,
-            ref_urls,
-            wd_urls,
-            _,
-        ) = links(catalog, entity, blacklist, wd_cache=wd_cache)
-    else:
-        (
-            deprecate,
-            add_ext_ids,
-            add_urls,
-            ref_ext_ids,
-            ref_urls,
-            wd_urls,
-            wd_cache,
-        ) = links(catalog, entity, blacklist)
+
+    # Run validation
+    result = links(catalog, entity, url_blacklist=blacklist, wd_cache=wd_cache)
 
     # Nothing to do: the catalog doesn't contain links
-    if deprecate is None:
+    if result is None:
         return
 
+    # Unpack the result tuple
+    deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = result
     # Dump output files
     _dump_deprecated(deprecate, deprecate_path)
     _dump_csv_output(
@@ -383,6 +369,7 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
     You can pass the '-u' flag to upload the output to Wikidata.
     """
     criterion = 'bio'
+    # Output paths
     deprecate_path = os.path.join(
         dir_io,
         IDS_TO_BE_DEPRECATED_FNAME.format(
@@ -412,22 +399,22 @@ def bio_cli(catalog, entity, upload, sandbox, dump_wikidata, dir_io):
         ),
     )
 
-    # Handle Wikidata cache
+    # Wikidata cache
+    wd_cache = None
     if os.path.isfile(wd_cache_path):
         with open(wd_cache_path, 'rb') as cin:
             wd_cache = pickle.load(cin)
             LOGGER.info("Loaded Wikidata cache from '%s'", cin.name)
-        # Discard the last return value: Wikidata cache
-        deprecate, add, reference, wd_stmts, _ = bio(
-            catalog, entity, wd_cache=wd_cache
-        )
-    else:
-        deprecate, add, reference, wd_stmts, wd_cache = bio(catalog, entity)
+
+    # Run validation
+    result = bio(catalog, entity, wd_cache=wd_cache)
 
     # Nothing to do: the catalog doesn't contain biographical data
-    if deprecate is None:
+    if result is None:
         return
 
+    # Unpack the result tuple
+    deprecate, add, reference, wd_stmts, wd_cache = result
     # Dump output files
     _dump_deprecated(deprecate, deprecate_path)
     _dump_csv_output(add, add_path, 'statements to be added')
@@ -557,12 +544,12 @@ def links(
     """Validate identifiers against available links.
 
     Also generate statements based on additional links
-    found in the given catalog.
+    found in the target catalog.
     They can be used to enrich Wikidata items.
 
     **How it works:**
 
-    1. gather links from the given catalog
+    1. gather links from the target catalog
     2. gather links from relevant Wikidata items
     3. look for shared links between pairs of Wikidata and catalog items:
 
@@ -592,12 +579,14 @@ def links(
         6. ``list`` of URLs found in Wikidata but not in the target catalog
         7. ``dict`` of links gathered from Wikidata
+        or ``None`` if the target catalog has no links.
+
     """
     # Target catalog side first:
     # enable early return in case of no target links
     target_links = data_gathering.gather_target_links(entity, catalog)
     if target_links is None:
-        return None, None, None, None, None, None, None
+        return None
 
     deprecate, add = defaultdict(set), defaultdict(set)
     reference, wd_only = defaultdict(set), defaultdict(set)
@@ -685,7 +674,7 @@ def bio(
     - gender
 
     Also generate statements based on additional data
-    found in the given catalog.
+    found in the target catalog.
     They can be used to enrich Wikidata items.
 
     **How it works:**
@@ -714,12 +703,14 @@ def bio(
         4. ``generator`` of statements found in Wikidata but not in the target catalog
         5. ``dict`` of biographical data gathered from Wikidata
+        or ``None`` if the target catalog has no biographical data.
+
     """
     # Target catalog side first:
     # enable early return in case of no target data
     target_bio = data_gathering.gather_target_biodata(entity, catalog)
     if target_bio is None:
-        return None, None, None, None, None
+        return None
 
     deprecate, add = defaultdict(set), defaultdict(set)
     reference, wd_only = defaultdict(set), defaultdict(set)

From 678995e13b1ff24d0674c6c8f2e8b70fed0fefc5 Mon Sep 17 00:00:00 2001
From: travis
Date: Thu, 2 Sep 2021 16:35:09 +0000
Subject: [PATCH 9/9] format code & organize imports

---
 soweego/validator/checks.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/soweego/validator/checks.py b/soweego/validator/checks.py
index aa7fbdca..d5cfac6e 100644
--- a/soweego/validator/checks.py
+++ b/soweego/validator/checks.py
@@ -261,7 +261,15 @@ def links_cli(
         return
 
     # Unpack the result tuple
-    deprecate, add_ext_ids, add_urls, ref_ext_ids, ref_urls, wd_urls, wd_cache = result
+    (
+        deprecate,
+        add_ext_ids,
+        add_urls,
+        ref_ext_ids,
+        ref_urls,
+        wd_urls,
+        wd_cache,
+    ) = result
     # Dump output files
     _dump_deprecated(deprecate, deprecate_path)
     _dump_csv_output(