Review notes (free text placed before the first "diff --git" header):
- This patch had been stored with its line breaks collapsed. The layout below
  is rebuilt from the hunk headers; leading whitespace and the position of
  blank context lines are a best-effort restoration -- TODO confirm against
  the working tree before applying.
- The SPARQL PREFIX declarations and the class in the developmental-stage
  query had lost their <IRI> tokens. They are restored from the "uri" fields
  of the same entries and the standard rdfs/skos namespaces. The dc: namespace
  is assumed to be Dublin Core Terms -- TODO confirm.
- NOTE(review): "oma_gene" is typed as orth#Gene but its query selects
  orth:Protein -- confirm this is intentional.
- NOTE(review): embed.py and pyproject.toml keep disabled code as comments;
  consider deleting it instead, since the entity indexing now lives in
  embed_entities.py.
- NOTE(review): embed_entities.py queries the endpoints at module level, so
  the requests appear to run on import -- confirm this is intended.

diff --git a/pyproject.toml b/pyproject.toml
index 76deaca..9e109eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,7 +73,7 @@ cpu = [
     "fastembed",
 ]
 gpu = [
-    "onnxruntime-gpu",
+    # "onnxruntime-gpu",
     "fastembed-gpu",
 ]
 
diff --git a/src/sparql_llm/embed.py b/src/sparql_llm/embed.py
index acfb2a4..69c434a 100644
--- a/src/sparql_llm/embed.py
+++ b/src/sparql_llm/embed.py
@@ -18,6 +18,7 @@ from sparql_llm.utils import get_prefixes_for_endpoints, query_sparql
 
 
 def get_embedding_model() -> TextEmbedding:
+    # return TextEmbedding(settings.embedding_model, cuda=True)
     return TextEmbedding(settings.embedding_model)
 
 
@@ -204,32 +205,32 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None:
     )
     print(f"Done generating and indexing {len(docs)} documents into the vectordb in {time.time() - start_time} seconds")
 
-    docs = []
-    # TODO: Add entities list to the vectordb
-    for entity in entities_list.values():
-        res = query_sparql(entity["query"], entity["endpoint"])
-        for entity_res in res["results"]["bindings"]:
-            docs.append(
-                Document(
-                    page_content=entity_res["label"],
-                    metadata={
-                        "label": entity_res["label"],
-                        "uri": entity_res["uri"],
-                        "endpoint_url": entity["endpoint"],
-                        "entity_type": entity["uri"],
-                    },
-                )
-            )
-    print(f"Generating embeddings for {len(docs)} entities")
-    vectordb.upsert(
-        collection_name="entities",
-        points=models.Batch(
-            ids=list(range(1, len(docs) + 1)),
-            vectors=embeddings,
-            payloads=[doc.metadata for doc in docs],
-        ),
-        # wait=False,  # Waiting for indexing to finish or not
-    )
+    # docs = []
+    # # TODO: Add entities list to the vectordb
+    # for entity in entities_list.values():
+    #     res = query_sparql(entity["query"], entity["endpoint"])
+    #     for entity_res in res["results"]["bindings"]:
+    #         docs.append(
+    #             Document(
+    #                 page_content=entity_res["label"],
+    #                 metadata={
+    #                     "label": entity_res["label"],
+    #                     "uri": entity_res["uri"],
+    #                     "endpoint_url": entity["endpoint"],
+    #                     "entity_type": entity["uri"],
+    #                 },
+    #             )
+    #         )
+    # print(f"Generating embeddings for {len(docs)} entities")
+    # vectordb.upsert(
+    #     collection_name="entities",
+    #     points=models.Batch(
+    #         ids=list(range(1, len(docs) + 1)),
+    #         vectors=embeddings,
+    #         payloads=[doc.metadata for doc in docs],
+    #     ),
+    #     # wait=False,  # Waiting for indexing to finish or not
+    # )
 
 if __name__ == "__main__":
     init_vectordb()
diff --git a/src/sparql_llm/embed_entities.py b/src/sparql_llm/embed_entities.py
index 2f879b2..7b70056 100644
--- a/src/sparql_llm/embed_entities.py
+++ b/src/sparql_llm/embed_entities.py
@@ -10,8 +10,8 @@
 
 entities_list = {
     "genex:AnatomicalEntity": {
-        "label": "Anatomical entity",
         "uri": "http://purl.org/genex#AnatomicalEntity",
+        "label": "Anatomical entity",
         "description": "An anatomical entity can be an organism part (e.g. brain, blood, liver and so on) or a material anatomical entity such as a cell.",
         "endpoint": "https://www.bgee.org/sparql/",
         "query": """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
@@ -23,10 +23,95 @@
 }"""
     },
     "bgee_species": {
-        "label": "Anatomical entity",
-        "uri": "http://purl.uniprot.org/core/Species",
-        "description": "An anatomical entity can be an organism part (e.g. brain, blood, liver and so on) or a material anatomical entity such as a cell.",
+        "uri": "http://purl.uniprot.org/core/Taxon",
+        "label": "species",
+        "description": "species scientific names",
+        "endpoint": "https://www.bgee.org/sparql/",
+        "query": """PREFIX up: <http://purl.uniprot.org/core/>
+SELECT ?uri ?label
+WHERE {
+    ?uri a up:Taxon ;
+        up:rank up:Species ;
+        up:scientificName ?label .
+}"""
+    },
+    "efo:EFO_0000399": {
+        "uri": "http://www.ebi.ac.uk/efo/EFO_0000399",
+        "label": "developmental stage",
+        "description": "A developmental stage is spatiotemporal region encompassing some part of the life cycle of an organism, e.g. blastula stage.",
+        "endpoint": "https://www.bgee.org/sparql/",
+        "query": """PREFIX genex: <http://purl.org/genex#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+SELECT DISTINCT ?uri ?label {
+    ?uri a <http://www.ebi.ac.uk/efo/EFO_0000399> .
+    ?uri rdfs:label ?label .}"""
+    },
+    "bgee_gene": {
+        "uri": "http://purl.org/net/orth#Gene",
+        "label": "Gene",
+        "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
         "endpoint": "https://www.bgee.org/sparql/",
+        "query": """PREFIX orth: <http://purl.org/net/orth#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+SELECT DISTINCT ?uri ?label {
+    ?uri a orth:Gene .
+    ?uri rdfs:label ?label .}"""
+    },
+    "oma_protein": {
+        "uri": "http://purl.org/net/orth#Protein",
+        "label": "Protein",
+        "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
+        "endpoint": "https://sparql.omabrowser.org/sparql/",
+        "query": """PREFIX dc: <http://purl.org/dc/terms/>
+PREFIX orth: <http://purl.org/net/orth#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+SELECT DISTINCT ?uri ?label {
+    ?uri a orth:Protein .
+    {?uri rdfs:label ?label .}
+    UNION {
+    ?uri dc:identifier ?label .}
+}"""
+    },
+    "oma_gene": {
+        "uri": "http://purl.org/net/orth#Gene",
+        "label": "Gene",
+        "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
+        "endpoint": "https://sparql.omabrowser.org/sparql/",
+        "query": """PREFIX orth: <http://purl.org/net/orth#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+SELECT DISTINCT ?uri ?label {
+    ?uri a orth:Protein .
+    ?uri rdfs:label ?label .}"""
+    },
+    "uniprot_gene": {
+        "uri": "http://purl.uniprot.org/core/Gene",
+        "label": "Gene",
+        "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
+        "endpoint": "https://sparql.uniprot.org/sparql/",
+        "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX up: <http://purl.uniprot.org/core/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+SELECT DISTINCT ?uri ?label {
+    ?uri a up:Gene .
+    ?uri skos:prefLabel ?label .}"""
+    },
+    "uniprot_protein": {
+        "uri": "http://purl.uniprot.org/core/Protein",
+        "label": "Protein",
+        "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
+        "endpoint": "https://sparql.uniprot.org/sparql/",
+        "query": """PREFIX up: <http://purl.uniprot.org/core/>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+SELECT DISTINCT ?uri ?label {
+    ?uri a up:Protein .
+    ?uri rdfs:label ?label .}"""
+    },
+    "uniprot_species": {
+        "uri": "http://purl.uniprot.org/core/Taxon",
+        "label": "species",
+        "description": "species scientific names",
+        "endpoint": "https://sparql.uniprot.org/sparql/",
         "query": """PREFIX up: <http://purl.uniprot.org/core/>
 SELECT ?uri ?label
 WHERE {
@@ -35,12 +120,26 @@
         up:scientificName ?label .
 }"""
     },
+    "oma_species": {
+        "uri": "http://purl.uniprot.org/core/Taxon",
+        "label": "species",
+        "description": "species scientific names",
+        "endpoint": "https://sparql.omabrowser.org/sparql/",
+        "query": """PREFIX up: <http://purl.uniprot.org/core/>
+SELECT ?uri ?label
+WHERE {
+    ?uri a up:Taxon ;
+        up:rank up:Species ;
+        up:scientificName ?label .
+}"""
+    }
 }
 
 docs: list[Document] = []
 for entity in entities_list.values():
-    res = query_sparql(entity["query"], entity["endpoint"])
-    for entity_res in res["results"]["bindings"]:
+    entities_res = query_sparql(entity["query"], entity["endpoint"])["results"]["bindings"]
+    print(f"Found {len(entities_res)} entities for {entity['label']} in {entity['endpoint']}")
+    for entity_res in entities_res:
         docs.append(
             Document(
                 page_content=entity_res["label"]["value"],