Validation criterion 3: biographical data #419

Merged · 23 commits · Aug 19, 2021

Commits

31fab3a  update & refurbish CLI docstrings (marfox, Aug 2, 2021)
2d79ff2  add record linkage item to reference non-machine learning bot edits (marfox, Aug 4, 2021)
8952826  [WIP] start refactoring to parametrize the P887 heuristic (marfox, Aug 4, 2021)
17d6236  close #406; fix WD cache loading bug; don't remove dates from the or… (marfox, Aug 4, 2021)
b35e1ec  [WIP] huge refactoring (marfox, Aug 5, 2021)
41d4d30  pass catalog QID to works; improve CLI help; use sandbox 2; simplify … (marfox, Aug 6, 2021)
cb814da  handle edit summary in public function; use sandbox item 2 everywhere… (marfox, Aug 9, 2021)
46b8080  simpler although a bit more redundant WD upload code; update docstrin… (marfox, Aug 9, 2021)
df82da1  remove 2 (marfox, Aug 9, 2021)
b37bad1  log an info msg when uploading to sandbox (marfox, Aug 9, 2021)
d3427d0  keep track of matching dates to avoid incorrect comparisons, closes #… (marfox, Aug 10, 2021)
2a90b0f  expect date strings in 'ISO-date/precision' format: avoid 01-01 preci… (marfox, Aug 10, 2021)
5fda9e8  [WIP] start work on issue #413 (marfox, Aug 10, 2021)
00b52b7  [WIP] comparison logic for values other than dates; refactor comparison (marfox, Aug 10, 2021)
93bc7f4  feeling-lucky resolution of a QID given a term, closes #414 (marfox, Aug 12, 2021)
5072c78  closes #413; closes #417; see #414 (marfox, Aug 12, 2021)
cbc36b9  use P2888 for bare URLs statements, closes #409 (marfox, Aug 12, 2021)
8205c80  validation order: deprecate, add, reference; links validation: dump s… (marfox, Aug 12, 2021)
20906aa  dump output first, then eventual cache; use highest pickle protocol; … (marfox, Aug 13, 2021)
39345a6  better type annotation & docstring (marfox, Aug 18, 2021)
fa16a4b  better type annotation & docstring again (marfox, Aug 18, 2021)
55a6d59  Merge fa16a4b45930d41da67f4e3fa7aa76c81c485589 into 8e0bf0c44617c791c… (marfox, Aug 18, 2021)
661e20b  format code & organize imports (Aug 18, 2021)
45 changes: 26 additions & 19 deletions soweego/commons/data_gathering.py
@@ -20,7 +20,13 @@
 from sqlalchemy import or_
 from tqdm import tqdm

-from soweego.commons import constants, keys, target_database, url_utils
+from soweego.commons import (
+    constants,
+    keys,
+    target_database,
+    text_utils,
+    url_utils,
+)
 from soweego.commons.db_manager import DBManager
 from soweego.importer import models
 from soweego.wikidata import api_requests, sparql_queries, vocabulary
@@ -382,27 +388,30 @@ def gather_wikidata_biodata(wikidata):
     for qid, pid, value in api_requests.get_biodata(wikidata.keys()):
         parsed = api_requests.parse_value(value)
         if not wikidata[qid].get(keys.BIODATA):
-            wikidata[qid][keys.BIODATA] = set()
-        # `parsed` is a set of labels if the value is a QID
-        # see api_requests.parse_value
+            wikidata[qid][keys.BIODATA] = []
+        # If `parsed` is a set, we have item labels,
+        # see `api_requests.parse_value` behavior
         if isinstance(parsed, set):
-            # The English label for gender should be enough
-            gender = parsed & {keys.MALE, keys.FEMALE}
-            if gender:
-                wikidata[qid][keys.BIODATA].add((pid, gender.pop()))
-            else:
-                # Add a (pid, label) tuple for each element
-                # for better recall
-                for element in parsed:
-                    wikidata[qid][keys.BIODATA].add((pid, element))
-        # `parsed` is a tuple (timestamp, precision) id the value is a date
+            # Keep track of the value QID
+            # Dict key checks are already done in `api_requests.parse_value`,
+            # so no need to redo it here
+            v_qid = value['id']
+            # Normalize & de-duplicate labels
+            # `text_utils.normalize` returns a tuple with two forms
+            # (non-lower, lower): take the lowercased one
+            labels = {text_utils.normalize(label)[1] for label in parsed}
+            # e.g., (P19, Q641, {'venezia', 'venice', ...})
+            wikidata[qid][keys.BIODATA].append((pid, v_qid, labels))
+        # If `parsed` is a tuple, we have a (timestamp, precision) date
         elif isinstance(parsed, tuple):
             timestamp, precision = parsed[0], parsed[1]
             # Get rid of time, useless
             timestamp = timestamp.split('T')[0]
-            wikidata[qid][keys.BIODATA].add((pid, f'{timestamp}/{precision}'))
+            wikidata[qid][keys.BIODATA].append(
+                (pid, f'{timestamp}/{precision}')
+            )
         else:
-            wikidata[qid][keys.BIODATA].add((pid, parsed))
+            wikidata[qid][keys.BIODATA].append((pid, parsed))
         total += 1

     LOGGER.info('Got %d statements', total)
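
To make the new data layout concrete, here is a minimal sketch of one `wikidata[qid][keys.BIODATA]` list after this change (not part of the PR; the sample values are made up, extrapolating from the comments in the hunk above): item-valued claims become (PID, value QID, normalized labels) triples, dates become (PID, 'ISO-date/precision') strings per commit 2a90b0f, and any other parsed value stays a plain (PID, value) pair.

# Hypothetical BIODATA entry after gather_wikidata_biodata runs
biodata = [
    # Item value: (PID, value QID, lowercased & de-duplicated labels),
    # e.g. P19 (place of birth) pointing to Q641 (Venice)
    ('P19', 'Q641', {'venice', 'venezia', 'venedig'}),
    # Date value: (PID, 'ISO-date/precision');
    # precision 11 means day in the Wikidata data model
    ('P569', '1725-04-02/11'),
    # Anything else: (PID, parsed value), e.g. P1477 (birth name)
    ('P1477', 'giacomo girolamo casanova'),
]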
@@ -523,9 +532,7 @@ def extract_ids_from_urls(to_be_added, ext_id_pids_to_urls):
         if ext_id is not None:
             ext_ids_to_add.append((qid, pid, ext_id, tid,))
         else:
-            urls_to_add.append(
-                (qid, vocabulary.DESCRIBED_AT_URL, url, tid,)
-            )
+            urls_to_add.append((qid, vocabulary.EXACT_MATCH, url, tid,))
     return (
         ext_ids_to_add,
         urls_to_add,
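
Likewise, a minimal sketch of the two lists `extract_ids_from_urls` returns after this change (hypothetical sample data; the QIDs, PIDs, and target IDs are illustrative): URLs matching a known external-identifier pattern yield (QID, PID, extracted ID, target ID) tuples, while bare URLs are now emitted as P2888 (exact match) statements, per commit cbc36b9, instead of P973 (described at URL) ones.

# URL matched an external-ID pattern, e.g. a Discogs artist URL (P1953)
ext_ids_to_add = [('Q254', 'P1953', '123456', 'some-target-id')]
# Bare URL: vocabulary.EXACT_MATCH is expected to hold 'P2888'
urls_to_add = [('Q254', 'P2888', 'https://example.org/mozart', 'some-target-id')]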