Do not reuse URIs of dataset entities
- makes sure that when different vocabs set different mappings for the same dataset, the mappings are not UNIONed but work as intended

- before, deleting one vocab could also delete the unified entities of another vocab if the same dataset was used. This fixes the bug caused by that.
Ruben authored and x-m-el committed Apr 22, 2024
1 parent a24a814 commit 89e29a5
Showing 4 changed files with 51 additions and 62 deletions.
4 changes: 4 additions & 0 deletions config/resources/dataset.json
@@ -108,6 +108,10 @@
"source-dataset": {
"predicate": "dct:source",
"type": "url"
},
"dataset-entity-uri": {
"type": "url",
"predicate": "prov:wasDerivedFrom"
}
},
"features": ["include-uri"],
2 changes: 2 additions & 0 deletions config/search/config.json
@@ -46,6 +46,7 @@
"type": "language-string"
},
"sourceDataset": "http://purl.org/dc/terms/source",
"datasetEntityUri": "http://www.w3.org/ns/prov#wasDerivedFrom",
"vocabulary": [
"http://purl.org/dc/terms/source",
"^http://mu.semte.ch/vocabularies/ext/sourceDataset"
@@ -55,6 +56,7 @@
"mappings": {
"properties": {
"sourceDataset": { "type": "keyword" },
"datasetEntityUri": {"type": "keyword"},
"vocabulary": {"type": "keyword" },
"prefLabel": {
"properties": {
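Because datasetEntityUri is indexed as a keyword, search can match a unified entity by the exact URI of the dataset entity it was derived from. A minimal illustrative query body, expressed as a Python dict (the URI is hypothetical and this query is not part of this commit):

es_query = {
    "query": {
        "term": {"datasetEntityUri": "http://example.org/dataset/term/42"}
    }
}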
96 changes: 40 additions & 56 deletions services/content-unification/unification.py
@@ -3,51 +3,7 @@
from escape_helpers import sparql_escape_uri, sparql_escape_datetime, sparql_escape_string

MU_APPLICATION_GRAPH = os.environ.get("MU_APPLICATION_GRAPH")


def unify_from_node_shape(node_shape, source_dataset, metadata_graph, source_graph, target_graph):
query_template = Template("""
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX dct: <http://purl.org/dc/terms/>
INSERT {
GRAPH $target_graph {
?s
a ?destClass ;
?destPath ?sourceValue .
?s dct:source $source_dataset .
}
}
WHERE {
GRAPH $metadata_graph {
$node_shape
a sh:NodeShape ;
sh:targetClass ?sourceClass ;
sh:property ?propertyShape .
?propertyShape
a sh:PropertyShape ;
sh:description ?destPath ;
sh:path ?pathString .
BIND(URI(?pathString) AS ?sourcePath)
BIND(skos:Concept as ?destClass)
}
GRAPH $source_graph {
?s
a ?sourceClass ;
?sourcePath ?sourceValue .
}
}""")
query_string = query_template.substitute(
target_graph=sparql_escape_uri(target_graph),
source_dataset=sparql_escape_uri(source_dataset),
metadata_graph=sparql_escape_uri(metadata_graph),
source_graph=sparql_escape_uri(source_graph),
node_shape=sparql_escape_uri(node_shape)
)
return query_string

NEW_SUBJECT_BASE = "http://example-resource.com/dataset-subject/"

def get_property_paths(node_shape, metadata_graph):
query_template = Template("""
@@ -76,6 +32,19 @@ def get_property_paths(node_shape, metadata_graph):
return query_string


# Unification works as follows:
# the subject from the source is selected based on the class (Pivot Type)
# and predicate path (Label/Tag path) provided by the user.
# For every vocab that uses the dataset, that source subject is copied under a
# constructed URI (?internalSubject) that is unique per vocab (?vocabUri) and
# dataset subject (?sourceSubject); prov:wasDerivedFrom links it back to the
# original dataset subject's URI.
# Note that this constructed URI is only needed for internal use: it avoids
# reusing URIs for the same dataset subject across multiple vocabs (which might
# use the same dataset source). The user is only interested in the actual
# dataset URI (via prov:wasDerivedFrom). A Python sketch of this URI
# construction follows get_ununified_batch below.

# A unified entity is connected to a vocab via one (or more) datasets, but
# conceptually a unified entity is part of a vocabulary (a vocab has one
# "unified dataset"), not of "multiple" datasets. As long as a concept is
# connected to one dataset of the vocab, the search will still find it.
def get_ununified_batch(dest_class,
dest_predicate,
source_datasets,
@@ -86,28 +55,36 @@
batch_size):
query_template = Template("""
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX ext: <http://mu.semte.ch/vocabularies/ext/>
CONSTRUCT {
?s a $dest_class .
?s $dest_predicate ?sourceValue .
?s dct:source ?sourceDataset .
?internalSubject
prov:wasDerivedFrom ?sourceSubject ;
a $dest_class ;
$dest_predicate ?sourceValue ;
dct:source ?sourceDataset .
}
WHERE {
VALUES ?sourceDataset {
$source_datasets
}
?vocabUri ext:sourceDataset ?sourceDataset .
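        # skip dataset subjects whose unified copy already exists in the target graph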
FILTER NOT EXISTS {
GRAPH $target_graph {
?s
?targetSubject
prov:wasDerivedFrom ?sourceSubject ;
a $dest_class ;
$dest_predicate ?sourceValue .
}
}
GRAPH $source_graph {
?s
?sourceSubject
a $source_class ;
$source_path_string ?sourceValue .
}
BIND(IRI(CONCAT($new_subject_uri_base, MD5(CONCAT(str(?vocabUri), str(?sourceSubject))))) as ?internalSubject)
}
LIMIT $batch_size
""")
@@ -116,23 +93,30 @@
dest_predicate=sparql_escape_uri(dest_predicate),
source_datasets="\n ".join([sparql_escape_uri(source_dataset) for source_dataset in source_datasets]),
source_class=sparql_escape_uri(source_class),
source_path_string=source_path_string, # !
source_path_string=source_path_string, # ! this is already formatted as a SPARQL predicate path by the frontend.
source_graph=sparql_escape_uri(source_graph),
target_graph=sparql_escape_uri(target_graph),
batch_size=batch_size
batch_size=batch_size,
new_subject_uri_base=sparql_escape_string(NEW_SUBJECT_BASE)
)
return query_string
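For reference, a minimal Python sketch of the URI construction done by the BIND above (not part of this commit; the helper name internal_subject_uri is hypothetical, and SPARQL's MD5() and Python's hashlib.md5 both produce the same lowercase hex digest):

import hashlib

NEW_SUBJECT_BASE = "http://example-resource.com/dataset-subject/"  # same base as above

def internal_subject_uri(vocab_uri, source_subject):
    # Same recipe as the SPARQL BIND: MD5 over the concatenation of
    # the vocab URI and the dataset subject URI, appended to the base.
    digest = hashlib.md5((vocab_uri + source_subject).encode("utf-8")).hexdigest()
    return NEW_SUBJECT_BASE + digest

# Two vocabs sharing one dataset subject now get distinct internal URIs,
# so unifying or deleting one vocab can no longer clash with the other:
a = internal_subject_uri("http://example.org/vocab/1", "http://example.org/dataset/term/42")
b = internal_subject_uri("http://example.org/vocab/2", "http://example.org/dataset/term/42")
assert a != b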


def delete_from_graph(subjects, graph):
# delete the provided subjects that are part of a dataset
# note that these relate to our app's internal subject URIs via prov:wasDerivedFrom,
# see the `get_ununified_batch` function for details
def delete_dataset_subjects_from_graph(subjects, graph):
query_template = Template("""
PREFIX prov: <http://www.w3.org/ns/prov#>
WITH $graph
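    # remove every internal subject derived from the given dataset subjects, including the derivation link itself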
DELETE {
?s ?p ?o .
?internalSubject prov:wasDerivedFrom ?datasetSubject .
?internalSubject ?p ?o .
}
WHERE {
?s ?p ?o .
VALUES ?s {
?internalSubject prov:wasDerivedFrom ?datasetSubject .
?internalSubject ?p ?o .
VALUES ?datasetSubject {
$subjects
}
}
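A hedged usage sketch of the renamed delete helper (the subject URI is illustrative; as the web.py diff below shows, the service calls it batch-wise through query_sudo):

changed_subjects = ["http://example.org/dataset/term/42"]  # dataset subject URIs, e.g. from diff_graphs
query = delete_dataset_subjects_from_graph(changed_subjects, VOCAB_GRAPH)
# query_sudo(query)  # drops the derived internal subjects, matched via prov:wasDerivedFrom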
11 changes: 5 additions & 6 deletions services/content-unification/web.py
@@ -29,10 +29,9 @@
from dataset import get_dataset

from unification import (
unify_from_node_shape,
get_property_paths,
get_ununified_batch,
delete_from_graph,
delete_dataset_subjects_from_graph,
)
from remove_vocab import (
remove_files,
@@ -74,7 +73,7 @@ def run_vocab_unification(vocab_uri):
print(dataset_versions)
# TODO: LDES check
if "data_dump" in dataset_versions[0].keys():
new_temp_named_graph = load_file_to_db(
temp_named_graph = load_file_to_db(
dataset_versions[0]["data_dump"]["value"], VOCAB_GRAPH, temp_named_graph
)
if len(dataset_versions) > 1: # previous dumps exist
@@ -85,9 +84,9 @@
# as sorted ntriples files, this can be done on file basis. Would improve perf
# and avoid having to load everything to triplestore with python rdflib store
# as an intermediary (!)
diff_subjects = diff_graphs(old_temp_named_graph, new_temp_named_graph)
diff_subjects = diff_graphs(old_temp_named_graph, temp_named_graph)
for diff_subjects_batch in batched(diff_subjects, 10):
query_sudo(delete_from_graph(diff_subjects_batch, VOCAB_GRAPH))
query_sudo(delete_dataset_subjects_from_graph(diff_subjects_batch, VOCAB_GRAPH))
drop_graph(old_temp_named_graph)
else:
# since we now also save ldes datasets to files, ldes datasets can also get
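run_vocab_unification above feeds changed subjects to the delete helper in batches of 10 via batched(diff_subjects, 10). Assuming batched behaves like Python 3.12's itertools.batched, a minimal equivalent helper would be:

from itertools import islice

def batched(iterable, n):
    # Yield successive tuples of at most n items each, like itertools.batched (3.12+).
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch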
@@ -147,8 +146,8 @@ def delete_vocabulary(vocab_uuid: str):
update_sudo(query_string)
else:
break
# TODO: these job deletions are not yet adjusted to the new Jobs structure (which uses data containers)
update_sudo(remove_vocab_vocab_fetch_jobs(vocab_uuid, VOCAB_GRAPH))

update_sudo(remove_vocab_vocab_unification_jobs(vocab_uuid, VOCAB_GRAPH))
update_sudo(remove_vocab_partitions(vocab_uuid, VOCAB_GRAPH))
update_sudo(remove_vocab_source_datasets(vocab_uuid, VOCAB_GRAPH))
