Do not reuse URIs of dataset entities
- makes sure that when different vocabs set different mappings for the same dataset, the mappings are not UNIONed but work as intended

- before, deleting one vocab could also delete the unified entities of another vocab if the same dataset was used. This fixes the bug caused by that.
Ruben authored and x-m-el committed Apr 22, 2024
1 parent a24a814 commit 89e29a5
Showing 4 changed files with 51 additions and 62 deletions.
4 changes: 4 additions & 0 deletions config/resources/dataset.json
@@ -108,6 +108,10 @@
"source-dataset": {
"predicate": "dct:source",
"type": "url"
},
"dataset-entity-uri": {
"type": "url",
"predicate": "prov:wasDerivedFrom"
}
},
"features": ["include-uri"],
2 changes: 2 additions & 0 deletions config/search/config.json
@@ -46,6 +46,7 @@
"type": "language-string"
},
"sourceDataset": "http://purl.org/dc/terms/source",
"datasetEntityUri": "http://www.w3.org/ns/prov#wasDerivedFrom",
"vocabulary": [
"http://purl.org/dc/terms/source",
"^http://mu.semte.ch/vocabularies/ext/sourceDataset"
@@ -55,6 +56,7 @@
"mappings": {
"properties": {
"sourceDataset": { "type": "keyword" },
"datasetEntityUri": {"type": "keyword"},
"vocabulary": {"type": "keyword" },
"prefLabel": {
"properties": {
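Because datasetEntityUri is indexed as a keyword, search can match a unified entity by the exact URI of the dataset entity it was derived from. A minimal illustrative query body, expressed as a Python dict (the URI is hypothetical and this query is not part of this commit):

es_query = {
    "query": {
        "term": {"datasetEntityUri": "http://example.org/dataset/term/42"}
    }
}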
96 changes: 40 additions & 56 deletions services/content-unification/unification.py
@@ -3,51 +3,7 @@
from escape_helpers import sparql_escape_uri, sparql_escape_datetime, sparql_escape_string

MU_APPLICATION_GRAPH = os.environ.get("MU_APPLICATION_GRAPH")


def unify_from_node_shape(node_shape, source_dataset, metadata_graph, source_graph, target_graph):
query_template = Template("""
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX dct: <http://purl.org/dc/terms/>
INSERT {
GRAPH $target_graph {
?s
a ?destClass ;
?destPath ?sourceValue .
?s dct:source $source_dataset .
}
}
WHERE {
GRAPH $metadata_graph {
$node_shape
a sh:NodeShape ;
sh:targetClass ?sourceClass ;
sh:property ?propertyShape .
?propertyShape
a sh:PropertyShape ;
sh:description ?destPath ;
sh:path ?pathString .
BIND(URI(?pathString) AS ?sourcePath)
BIND(skos:Concept as ?destClass)
}
GRAPH $source_graph {
?s
a ?sourceClass ;
?sourcePath ?sourceValue .
}
}""")
query_string = query_template.substitute(
target_graph=sparql_escape_uri(target_graph),
source_dataset=sparql_escape_uri(source_dataset),
metadata_graph=sparql_escape_uri(metadata_graph),
source_graph=sparql_escape_uri(source_graph),
node_shape=sparql_escape_uri(node_shape)
)
return query_string

NEW_SUBJECT_BASE = "http://example-resource.com/dataset-subject/"

def get_property_paths(node_shape, metadata_graph):
query_template = Template("""
@@ -76,6 +32,19 @@ def get_property_paths(node_shape, metadata_graph):
return query_string


# Unification works as follows:
# the subject from the source is selected based on the class (Pivot Type)
# and predicate path (Label/Tag path) provided by the user.
# For every vocab that uses the dataset, that source subject is copied under a
# constructed URI (?internalSubject) that is unique per vocab (?vocabUri) and
# dataset subject (?sourceSubject); prov:wasDerivedFrom links it back to the
# original dataset subject's URI.
# Note that this constructed URI is only needed for internal use: it avoids
# reusing URIs for the same dataset subject across multiple vocabs (which might
# use the same dataset source). The user is only interested in the actual
# dataset URI (via prov:wasDerivedFrom). A Python sketch of this URI
# construction follows get_ununified_batch below.

# A unified entity is connected to a vocab via one (or more) datasets, but
# conceptually a unified entity is part of a vocabulary (a vocab has one
# "unified dataset"), not of "multiple" datasets. As long as a concept is
# connected to one dataset of the vocab, the search will still find it.
def get_ununified_batch(dest_class,
dest_predicate,
source_datasets,
@@ -86,28 +55,36 @@
batch_size):
query_template = Template("""
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX ext: <http://mu.semte.ch/vocabularies/ext/>
CONSTRUCT {
?s a $dest_class .
?s $dest_predicate ?sourceValue .
?s dct:source ?sourceDataset .
?internalSubject
prov:wasDerivedFrom ?sourceSubject ;
a $dest_class ;
$dest_predicate ?sourceValue ;
dct:source ?sourceDataset .
}
WHERE {
VALUES ?sourceDataset {
$source_datasets
}
?vocabUri ext:sourceDataset ?sourceDataset .
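        # skip dataset subjects whose unified copy already exists in the target graph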
FILTER NOT EXISTS {
GRAPH $target_graph {
?s
?targetSubject
prov:wasDerivedFrom ?sourceSubject ;
a $dest_class ;
$dest_predicate ?sourceValue .
}
}
GRAPH $source_graph {
?s
?sourceSubject
a $source_class ;
$source_path_string ?sourceValue .
}
BIND(IRI(CONCAT($new_subject_uri_base, MD5(CONCAT(str(?vocabUri), str(?sourceSubject))))) as ?internalSubject)
}
LIMIT $batch_size
""")
@@ -116,23 +93,30 @@
dest_predicate=sparql_escape_uri(dest_predicate),
source_datasets="\n ".join([sparql_escape_uri(source_dataset) for source_dataset in source_datasets]),
source_class=sparql_escape_uri(source_class),
source_path_string=source_path_string, # !
source_path_string=source_path_string, # ! this is already formatted as a SPARQL predicate path by the frontend.
source_graph=sparql_escape_uri(source_graph),
target_graph=sparql_escape_uri(target_graph),
batch_size=batch_size
batch_size=batch_size,
new_subject_uri_base=sparql_escape_string(NEW_SUBJECT_BASE)
)
return query_string
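For reference, a minimal Python sketch of the URI construction done by the BIND above (not part of this commit; the helper name internal_subject_uri is hypothetical, and SPARQL's MD5() and Python's hashlib.md5 both produce the same lowercase hex digest):

import hashlib

NEW_SUBJECT_BASE = "http://example-resource.com/dataset-subject/"  # same base as above

def internal_subject_uri(vocab_uri, source_subject):
    # Same recipe as the SPARQL BIND: MD5 over the concatenation of
    # the vocab URI and the dataset subject URI, appended to the base.
    digest = hashlib.md5((vocab_uri + source_subject).encode("utf-8")).hexdigest()
    return NEW_SUBJECT_BASE + digest

# Two vocabs sharing one dataset subject now get distinct internal URIs,
# so unifying or deleting one vocab can no longer clash with the other:
a = internal_subject_uri("http://example.org/vocab/1", "http://example.org/dataset/term/42")
b = internal_subject_uri("http://example.org/vocab/2", "http://example.org/dataset/term/42")
assert a != b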


def delete_from_graph(subjects, graph):
# delete the provided subjects that are part of a dataset
# note that these relate to our app's internal subject URIs via prov:wasDerivedFrom,
# see the `get_ununified_batch` function for details
def delete_dataset_subjects_from_graph(subjects, graph):
query_template = Template("""
PREFIX prov: <http://www.w3.org/ns/prov#>
WITH $graph
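    # remove every internal subject derived from the given dataset subjects, including the derivation link itself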
DELETE {
?s ?p ?o .
?internalSubject prov:wasDerivedFrom ?datasetSubject .
?internalSubject ?p ?o .
}
WHERE {
?s ?p ?o .
VALUES ?s {
?internalSubject prov:wasDerivedFrom ?datasetSubject .
?internalSubject ?p ?o .
VALUES ?datasetSubject {
$subjects
}
}
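A hedged usage sketch of the renamed delete helper (the subject URI is illustrative; as the web.py diff below shows, the service calls it batch-wise through query_sudo):

changed_subjects = ["http://example.org/dataset/term/42"]  # dataset subject URIs, e.g. from diff_graphs
query = delete_dataset_subjects_from_graph(changed_subjects, VOCAB_GRAPH)
# query_sudo(query)  # drops the derived internal subjects, matched via prov:wasDerivedFrom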
11 changes: 5 additions & 6 deletions services/content-unification/web.py
@@ -29,10 +29,9 @@
from dataset import get_dataset

from unification import (
unify_from_node_shape,
get_property_paths,
get_ununified_batch,
delete_from_graph,
delete_dataset_subjects_from_graph,
)
from remove_vocab import (
remove_files,
@@ -74,7 +73,7 @@ def run_vocab_unification(vocab_uri):
print(dataset_versions)
# TODO: LDES check
if "data_dump" in dataset_versions[0].keys():
new_temp_named_graph = load_file_to_db(
temp_named_graph = load_file_to_db(
dataset_versions[0]["data_dump"]["value"], VOCAB_GRAPH, temp_named_graph
)
if len(dataset_versions) > 1: # previous dumps exist
@@ -85,9 +84,9 @@
# as sorted ntriples files, this can be done on file basis. Would improve perf
# and avoid having to load everything to triplestore with python rdflib store
# as an intermediary (!)
diff_subjects = diff_graphs(old_temp_named_graph, new_temp_named_graph)
diff_subjects = diff_graphs(old_temp_named_graph, temp_named_graph)
for diff_subjects_batch in batched(diff_subjects, 10):
query_sudo(delete_from_graph(diff_subjects_batch, VOCAB_GRAPH))
query_sudo(delete_dataset_subjects_from_graph(diff_subjects_batch, VOCAB_GRAPH))
drop_graph(old_temp_named_graph)
else:
# since we now also save ldes datasets to files, ldes datasets can also get
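run_vocab_unification above feeds changed subjects to the delete helper in batches of 10 via batched(diff_subjects, 10). Assuming batched behaves like Python 3.12's itertools.batched, a minimal equivalent helper would be:

from itertools import islice

def batched(iterable, n):
    # Yield successive tuples of at most n items each, like itertools.batched (3.12+).
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch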
@@ -147,8 +146,8 @@ def delete_vocabulary(vocab_uuid: str):
update_sudo(query_string)
else:
break
# TODO: these job deletions are not yet adjusted to the new Jobs structure (which uses data containers)
update_sudo(remove_vocab_vocab_fetch_jobs(vocab_uuid, VOCAB_GRAPH))

update_sudo(remove_vocab_vocab_unification_jobs(vocab_uuid, VOCAB_GRAPH))
update_sudo(remove_vocab_partitions(vocab_uuid, VOCAB_GRAPH))
update_sudo(remove_vocab_source_datasets(vocab_uuid, VOCAB_GRAPH))
