Skip to content

Commit

Permalink
add support for multiple ontologies
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Dec 6, 2023
1 parent 9c03167 commit 7224490
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 34 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ Automatically generate descriptive [CSVW](https://csvw.org) (CSV on the Web) met
- **Extract columns datatypes**: detect if they are categorical, and which values are accepted, using [`ydata-profiling`](https://github.com/ydataai/ydata-profiling).
- **Ontology mappings**: when provided with a URL to an OWL ontology, text embeddings are generated and stored in a local [Qdrant](https://github.com/qdrant/qdrant) vector database for all classes and properties; we then use similarity search to match each data column to the most relevant ontology terms.
- Currently supports: CSV, Excel, SPSS files. Any format that can be loaded into a Pandas DataFrame could easily be added; create an issue on GitHub to request support for a new format.
- Processed files need to contain 1 sheet; if multiple sheets are present in a file, only the first one will be processed.

> [!WARNING]
>
> Processed files need to contain 1 sheet; if multiple sheets are present in a file, only the first one will be processed.
> The lib does not yet check whether the VectorDB has been fully loaded. It will skip loading if there are at least 2 vectors in the DB, so if you stop the loading process halfway through, you will need to delete the VectorDB folder to make sure the lib re-runs the ontology loading.
## 📦️ Installation

Expand Down
4 changes: 2 additions & 2 deletions src/csvw_ontomap/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
@cli.command("profile")
def cli_profile(
files: List[str] = typer.Argument(None, help="Files to profile"),
ontology: str = typer.Option(None, "-m", help="URL to the OWL ontology to map the CSV columns to"),
ontologies: List[str] = typer.Option(None, "-m", help="URLs to the OWL ontologies to map the CSV columns to"),
vectordb: str = typer.Option("data/vectordb", "-d", help="Path to the VectorDB"),
best_matches: int = typer.Option(0, help="Number of best matches to add to each column as rdfs:comment"),
threshold: float = typer.Option(
Expand All @@ -25,7 +25,7 @@ def cli_profile(
comment_best_matches=best_matches,
search_threshold=threshold,
)
profiler = CsvwProfiler(ontology, vectordb, config)
profiler = CsvwProfiler(ontologies, vectordb, config)
report = profiler.profile_files(files)
if output:
if verbose:
Expand Down
47 changes: 26 additions & 21 deletions src/csvw_ontomap/ontology.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Load and search an ontology with a vectordb."""
from typing import Any
from typing import Any, List

from fastembed.embedding import FlagEmbedding as Embedding
from owlready2 import get_ontology
Expand All @@ -18,44 +18,45 @@
COLLECTION_NAME = "csvw-ontomap"


def load_vectordb(ontology_url: str, vectordb_path: str) -> None:
print(f"📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")
onto = get_ontology(ontology_url).load()

def load_vectordb(ontologies: List[str], vectordb_path: str, recreate: bool = False) -> None:
# Initialize FastEmbed and Qdrant Client
print("📥 Loading embedding model")
embedding_model = Embedding(model_name=EMBEDDING_MODEL_NAME, max_length=512)

vectordb = QdrantClient(path=vectordb_path)

# Check if vectordb is already loaded
try:
all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
vectors_count = vectordb.get_collection(COLLECTION_NAME).points_count
print(
f"{all_onto_count} classes/properties in the ontology. And currently {BOLD}{vectors_count}{END} loaded in the VectorDB"
)
if vectors_count <= all_onto_count:
raise Exception("Not enough vectors.")
except Exception as e:
print(f"🔄 {e!s} Recreating VectorDB")
# all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
# print(f"{all_onto_count} classes/properties in the ontology. And currently {BOLD}{vectors_count}{END} loaded in the VectorDB")
if vectors_count <= 2:
raise Exception("Not enough vectors")
except Exception:
recreate = True
# TODO: for each ontology check if there are more vectors than classes/properties
# And skip building if enough vectors for this ontology
if recreate:
print(f"🔄 Recreating VectorDB in {vectordb_path}")
vectordb.recreate_collection(
collection_name=COLLECTION_NAME,
vectors_config=VectorParams(size=EMBEDDING_MODEL_SIZE, distance=Distance.COSINE),
)
for ontology_url in ontologies:
print(f"📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")
onto = get_ontology(ontology_url).load()
# Find labels, generate embeddings, and upload them
upload_concepts(onto.classes(), "class", ontology_url, vectordb, embedding_model)
upload_concepts(onto.properties(), "property", ontology_url, vectordb, embedding_model)

# Find labels, generate embeddings, and upload them
upload_concepts(onto.classes(), "class", vectordb, embedding_model)
upload_concepts(onto.properties(), "property", vectordb, embedding_model)


def upload_concepts(onto_concepts: Any, category: str, vectordb: Any, embedding_model: Any) -> None:
def upload_concepts(onto_concepts: Any, category: str, ontology_url: str, vectordb: Any, embedding_model: Any) -> None:
"""Generate and upload embeddings for label and description of a list of owlready2 classes/properties"""
print(f"🌀 Generating embeddings for {CYAN}{category}{END}") # ⏳
print(f" Generating embeddings for {category}")
concept_labels = []
concept_uris = []
for concept in onto_concepts:
# print(f"Class URI: {ent.iri}, Label: {ent.label}, Description: {str(ent.description.first())}, Comment: {ent.comment}")
print(concept.label, concept.description, concept.name)
if concept.label:
concept_uris.append(concept.iri)
concept_labels.append(str(concept.label.first()))
Expand All @@ -67,7 +68,11 @@ def upload_concepts(onto_concepts: Any, category: str, vectordb: Any, embedding_
embeddings = list(embedding_model.embed(concept_labels))
points_count: int = vectordb.get_collection(COLLECTION_NAME).points_count
class_points = [
PointStruct(id=points_count + i, vector=embedding, payload={"id": uri, "label": label, "category": category})
PointStruct(
id=points_count + i,
vector=embedding,
payload={"id": uri, "label": label, "category": category, "ontology": ontology_url},
)
for i, (uri, label, embedding) in enumerate(zip(concept_uris, concept_labels, embeddings))
]
print(f"{BOLD}{len(class_points)}{END} vectors generated for {len(list(onto_concepts))} {category}")
Expand Down
13 changes: 7 additions & 6 deletions src/csvw_ontomap/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,18 @@
class CsvwProfiler:
def __init__(
self,
ontology_url: Optional[str] = None,
ontologies: Optional[List[str]] = None,
vectordb_path: str = "data/vectordb",
config: Optional[OntomapConfig] = None,
recreate: bool = False,
) -> None:
"""Optionally provide an ontology that will be loaded to a vectordb for mapping"""
self.config = config if config else OntomapConfig()
self.csvw: Any = CSVW_BASE
self.ontology_url = ontology_url
self.ontologies = ontologies
self.vectordb_path = vectordb_path
if self.ontology_url:
load_vectordb(self.ontology_url, self.vectordb_path)
if self.ontologies:
load_vectordb(self.ontologies, self.vectordb_path, recreate)

def profile_files(self, files: List[str], config: Optional[OntomapConfig] = None) -> Any:
"""Profile a list of tabular files by generating report using https://github.com/ydataai/ydata-profiling
Expand Down Expand Up @@ -61,7 +62,7 @@ def profile_files(self, files: List[str], config: Optional[OntomapConfig] = None
for var_name, var_report in report["variables"].items():
col = {"titles": var_name, "dc:title": separate_words(var_name)}

if self.ontology_url:
if self.ontologies:
# Get most matching property or class from the ontology
matches = search_vectordb(self.vectordb_path, col["dc:title"], config.comment_best_matches)
if matches[0].score >= config.search_threshold:
Expand Down Expand Up @@ -170,6 +171,6 @@ def separate_words(input_string: str) -> str:
CSVW_BASE = {
"@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
# "dc:title": "CSVW profiling report",
"dialect": {"header": True, "encoding": "utf-8"},
# "dialect": {"header": True, "encoding": "utf-8"},
"tables": [],
}
11 changes: 7 additions & 4 deletions tests/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@

from csvw_ontomap import CsvwProfiler, OntomapConfig, __version__

ONTOLOGY_URL = "https://semanticscience.org/ontology/sio.owl"
# ONTOLOGY_URL= "http://www.lesfleursdunormal.fr/static/_downloads/omop_cdm_v6.owl"
ONTOLOGIES = [
# "https://semanticscience.org/ontology/sio.owl",
# "http://www.lesfleursdunormal.fr/static/_downloads/omop_cdm_v6.owl",
"data/LOINC.ttl",
]


def test_profiler():
Expand All @@ -18,14 +21,14 @@ def test_profiler():

def test_profiler_with_ontology():
"""Test the Profiler with ontology"""
profiler = CsvwProfiler(ONTOLOGY_URL)
profiler = CsvwProfiler(ONTOLOGIES)
csvw_report = profiler.profile_files(["tests/resources/heart.csv"])
validate_csvw(csvw_report)


def test_profiler_with_ontology_best_matches():
"""Test the Profiler with ontology and add best matches"""
profiler = CsvwProfiler(ONTOLOGY_URL, config=OntomapConfig(comment_best_matches=3))
profiler = CsvwProfiler(ONTOLOGIES, config=OntomapConfig(comment_best_matches=3))
csvw_report = profiler.profile_files(["tests/resources/heart.csv"])
validate_csvw(csvw_report)

Expand Down

0 comments on commit 7224490

Please sign in to comment.