Skip to content

Commit

Permalink
add support for multiple ontologies
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Dec 6, 2023
1 parent 9c03167 commit 7224490
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 34 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ Automatically generate descriptive [CSVW](https://csvw.org) (CSV on the Web) met
- **Extract columns datatypes**: detect if they are categorical, and which values are accepted, using [`ydata-profiling`](https://github.com/ydataai/ydata-profiling).
- **Ontology mappings**: when provided with a URL to an OWL ontology, text embeddings are generated and stored in a local [Qdrant](https://github.com/qdrant/qdrant) vector database for all classes and properties; we then use similarity search to match each data column to the most relevant ontology terms.
- Currently supports: CSV, Excel, SPSS files. Any format that can be loaded into a Pandas DataFrame could easily be added; create an issue on GitHub to request support for a new format.
- Processed files need to contain 1 sheet; if multiple sheets are present in a file, only the first one will be processed.

> [!WARNING]
>
> Processed files need to contain 1 sheet; if multiple sheets are present in a file, only the first one will be processed.
> The lib does not yet check whether the VectorDB has been fully loaded. It will skip loading if there are at least 2 vectors in the DB, so if you stop the loading process halfway through, you will need to delete the VectorDB folder to make sure the lib re-runs the ontology loading.
## 📦️ Installation

Expand Down
4 changes: 2 additions & 2 deletions src/csvw_ontomap/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
@cli.command("profile")
def cli_profile(
files: List[str] = typer.Argument(None, help="Files to profile"),
ontology: str = typer.Option(None, "-m", help="URL to the OWL ontology to map the CSV columns to"),
ontologies: List[str] = typer.Option(None, "-m", help="URLs to the OWL ontologies to map the CSV columns to"),
vectordb: str = typer.Option("data/vectordb", "-d", help="Path to the VectorDB"),
best_matches: int = typer.Option(0, help="Number of best matches to add to each column as rdfs:comment"),
threshold: float = typer.Option(
Expand All @@ -25,7 +25,7 @@ def cli_profile(
comment_best_matches=best_matches,
search_threshold=threshold,
)
profiler = CsvwProfiler(ontology, vectordb, config)
profiler = CsvwProfiler(ontologies, vectordb, config)
report = profiler.profile_files(files)
if output:
if verbose:
Expand Down
47 changes: 26 additions & 21 deletions src/csvw_ontomap/ontology.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Load and search an ontology with a vectordb."""
from typing import Any
from typing import Any, List

from fastembed.embedding import FlagEmbedding as Embedding
from owlready2 import get_ontology
Expand All @@ -18,44 +18,45 @@
COLLECTION_NAME = "csvw-ontomap"


def load_vectordb(ontology_url: str, vectordb_path: str) -> None:
print(f"📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")
onto = get_ontology(ontology_url).load()

def load_vectordb(ontologies: List[str], vectordb_path: str, recreate: bool = False) -> None:
# Initialize FastEmbed and Qdrant Client
print("📥 Loading embedding model")
embedding_model = Embedding(model_name=EMBEDDING_MODEL_NAME, max_length=512)

vectordb = QdrantClient(path=vectordb_path)

# Check if vectordb is already loaded
try:
all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
vectors_count = vectordb.get_collection(COLLECTION_NAME).points_count
print(
f"{all_onto_count} classes/properties in the ontology. And currently {BOLD}{vectors_count}{END} loaded in the VectorDB"
)
if vectors_count <= all_onto_count:
raise Exception("Not enough vectors.")
except Exception as e:
print(f"🔄 {e!s} Recreating VectorDB")
# all_onto_count = len(list(onto.classes())) + len(list(onto.properties()))
# print(f"{all_onto_count} classes/properties in the ontology. And currently {BOLD}{vectors_count}{END} loaded in the VectorDB")
if vectors_count <= 2:
raise Exception("Not enough vectors")
except Exception:
recreate = True
# TODO: for each ontology check if there are more vectors than classes/properties
# And skip building if enough vectors for this ontology
if recreate:
print(f"🔄 Recreating VectorDB in {vectordb_path}")
vectordb.recreate_collection(
collection_name=COLLECTION_NAME,
vectors_config=VectorParams(size=EMBEDDING_MODEL_SIZE, distance=Distance.COSINE),
)
for ontology_url in ontologies:
print(f"📚 Loading ontology from {BOLD}{CYAN}{ontology_url}{END}")
onto = get_ontology(ontology_url).load()
# Find labels, generate embeddings, and upload them
upload_concepts(onto.classes(), "class", ontology_url, vectordb, embedding_model)
upload_concepts(onto.properties(), "property", ontology_url, vectordb, embedding_model)

# Find labels, generate embeddings, and upload them
upload_concepts(onto.classes(), "class", vectordb, embedding_model)
upload_concepts(onto.properties(), "property", vectordb, embedding_model)


def upload_concepts(onto_concepts: Any, category: str, vectordb: Any, embedding_model: Any) -> None:
def upload_concepts(onto_concepts: Any, category: str, ontology_url: str, vectordb: Any, embedding_model: Any) -> None:
"""Generate and upload embeddings for label and description of a list of owlready2 classes/properties"""
print(f"🌀 Generating embeddings for {CYAN}{category}{END}") # ⏳
print(f" Generating embeddings for {category}")
concept_labels = []
concept_uris = []
for concept in onto_concepts:
# print(f"Class URI: {ent.iri}, Label: {ent.label}, Description: {str(ent.description.first())}, Comment: {ent.comment}")
print(concept.label, concept.description, concept.name)
if concept.label:
concept_uris.append(concept.iri)
concept_labels.append(str(concept.label.first()))
Expand All @@ -67,7 +68,11 @@ def upload_concepts(onto_concepts: Any, category: str, vectordb: Any, embedding_
embeddings = list(embedding_model.embed(concept_labels))
points_count: int = vectordb.get_collection(COLLECTION_NAME).points_count
class_points = [
PointStruct(id=points_count + i, vector=embedding, payload={"id": uri, "label": label, "category": category})
PointStruct(
id=points_count + i,
vector=embedding,
payload={"id": uri, "label": label, "category": category, "ontology": ontology_url},
)
for i, (uri, label, embedding) in enumerate(zip(concept_uris, concept_labels, embeddings))
]
print(f"{BOLD}{len(class_points)}{END} vectors generated for {len(list(onto_concepts))} {category}")
Expand Down
13 changes: 7 additions & 6 deletions src/csvw_ontomap/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,18 @@
class CsvwProfiler:
def __init__(
self,
ontology_url: Optional[str] = None,
ontologies: Optional[List[str]] = None,
vectordb_path: str = "data/vectordb",
config: Optional[OntomapConfig] = None,
recreate: bool = False,
) -> None:
"""Optionally provide an ontology that will be loaded to a vectordb for mapping"""
self.config = config if config else OntomapConfig()
self.csvw: Any = CSVW_BASE
self.ontology_url = ontology_url
self.ontologies = ontologies
self.vectordb_path = vectordb_path
if self.ontology_url:
load_vectordb(self.ontology_url, self.vectordb_path)
if self.ontologies:
load_vectordb(self.ontologies, self.vectordb_path, recreate)

def profile_files(self, files: List[str], config: Optional[OntomapConfig] = None) -> Any:
"""Profile a list of tabular files by generating report using https://github.com/ydataai/ydata-profiling
Expand Down Expand Up @@ -61,7 +62,7 @@ def profile_files(self, files: List[str], config: Optional[OntomapConfig] = None
for var_name, var_report in report["variables"].items():
col = {"titles": var_name, "dc:title": separate_words(var_name)}

if self.ontology_url:
if self.ontologies:
# Get most matching property or class from the ontology
matches = search_vectordb(self.vectordb_path, col["dc:title"], config.comment_best_matches)
if matches[0].score >= config.search_threshold:
Expand Down Expand Up @@ -170,6 +171,6 @@ def separate_words(input_string: str) -> str:
CSVW_BASE = {
"@context": ["http://www.w3.org/ns/csvw", {"@language": "en"}],
# "dc:title": "CSVW profiling report",
"dialect": {"header": True, "encoding": "utf-8"},
# "dialect": {"header": True, "encoding": "utf-8"},
"tables": [],
}
11 changes: 7 additions & 4 deletions tests/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@

from csvw_ontomap import CsvwProfiler, OntomapConfig, __version__

ONTOLOGY_URL = "https://semanticscience.org/ontology/sio.owl"
# ONTOLOGY_URL= "http://www.lesfleursdunormal.fr/static/_downloads/omop_cdm_v6.owl"
ONTOLOGIES = [
# "https://semanticscience.org/ontology/sio.owl",
# "http://www.lesfleursdunormal.fr/static/_downloads/omop_cdm_v6.owl",
"data/LOINC.ttl",
]


def test_profiler():
Expand All @@ -18,14 +21,14 @@ def test_profiler():

def test_profiler_with_ontology():
"""Test the Profiler with ontology"""
profiler = CsvwProfiler(ONTOLOGY_URL)
profiler = CsvwProfiler(ONTOLOGIES)
csvw_report = profiler.profile_files(["tests/resources/heart.csv"])
validate_csvw(csvw_report)


def test_profiler_with_ontology_best_matches():
"""Test the Profiler with ontology and add best matches"""
profiler = CsvwProfiler(ONTOLOGY_URL, config=OntomapConfig(comment_best_matches=3))
profiler = CsvwProfiler(ONTOLOGIES, config=OntomapConfig(comment_best_matches=3))
csvw_report = profiler.profile_files(["tests/resources/heart.csv"])
validate_csvw(csvw_report)

Expand Down

0 comments on commit 7224490

Please sign in to comment.