Skip to content

Commit

Permalink
improve script for generating genes
Browse files (browse the repository at this point in the history)
  • Loading branch information
vemonet committed Nov 5, 2024
1 parent 7a66f17 commit 31bc884
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 33 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ cpu = [
"fastembed",
]
gpu = [
"onnxruntime-gpu",
# "onnxruntime-gpu",
"fastembed-gpu",
]

Expand Down
53 changes: 27 additions & 26 deletions src/sparql_llm/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from sparql_llm.utils import get_prefixes_for_endpoints, query_sparql

def get_embedding_model() -> TextEmbedding:
    """Build the text embedding model named in the project settings.

    Returns:
        A ``TextEmbedding`` constructed from ``settings.embedding_model``.
    """
    # GPU alternative kept for reference:
    # TextEmbedding(settings.embedding_model, cuda=True)
    model_name = settings.embedding_model
    return TextEmbedding(model_name)


Expand Down Expand Up @@ -204,32 +205,32 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None:
)
print(f"Done generating and indexing {len(docs)} documents into the vectordb in {time.time() - start_time} seconds")

docs = []
# TODO: Add entities list to the vectordb
for entity in entities_list.values():
res = query_sparql(entity["query"], entity["endpoint"])
for entity_res in res["results"]["bindings"]:
docs.append(
Document(
page_content=entity_res["label"],
metadata={
"label": entity_res["label"],
"uri": entity_res["uri"],
"endpoint_url": entity["endpoint"],
"entity_type": entity["uri"],
},
)
)
print(f"Generating embeddings for {len(docs)} entities")
vectordb.upsert(
collection_name="entities",
points=models.Batch(
ids=list(range(1, len(docs) + 1)),
vectors=embeddings,
payloads=[doc.metadata for doc in docs],
),
# wait=False, # Waiting for indexing to finish or not
)
# docs = []
# # TODO: Add entities list to the vectordb
# for entity in entities_list.values():
# res = query_sparql(entity["query"], entity["endpoint"])
# for entity_res in res["results"]["bindings"]:
# docs.append(
# Document(
# page_content=entity_res["label"],
# metadata={
# "label": entity_res["label"],
# "uri": entity_res["uri"],
# "endpoint_url": entity["endpoint"],
# "entity_type": entity["uri"],
# },
# )
# )
# print(f"Generating embeddings for {len(docs)} entities")
# vectordb.upsert(
# collection_name="entities",
# points=models.Batch(
# ids=list(range(1, len(docs) + 1)),
# vectors=embeddings,
# payloads=[doc.metadata for doc in docs],
# ),
# # wait=False, # Waiting for indexing to finish or not
# )

if __name__ == "__main__":
init_vectordb()
Expand Down
111 changes: 105 additions & 6 deletions src/sparql_llm/embed_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@

entities_list = {
"genex:AnatomicalEntity": {
"label": "Anatomical entity",
"uri": "http://purl.org/genex#AnatomicalEntity",
"label": "Anatomical entity",
"description": "An anatomical entity can be an organism part (e.g. brain, blood, liver and so on) or a material anatomical entity such as a cell.",
"endpoint": "https://www.bgee.org/sparql/",
"query": """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
Expand All @@ -23,10 +23,95 @@
}"""
},
"bgee_species": {
"label": "Anatomical entity",
"uri": "http://purl.uniprot.org/core/Species",
"description": "An anatomical entity can be an organism part (e.g. brain, blood, liver and so on) or a material anatomical entity such as a cell.",
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
"description": "species scientific names",
"endpoint": "https://www.bgee.org/sparql/",
"query": """PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?uri ?label
WHERE {
?uri a up:Taxon ;
up:rank up:Species ;
up:scientificName ?label .
}"""
},
"efo:EFO_0000399": {
"uri": "http://www.ebi.ac.uk/efo/EFO_0000399",
"label": "developmental stage",
"description": "A developmental stage is spatiotemporal region encompassing some part of the life cycle of an organism, e.g. blastula stage.",
"endpoint": "https://www.bgee.org/sparql/",
"query": """PREFIX genex: <http://purl.org/genex#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
?uri a <http://www.ebi.ac.uk/efo/EFO_0000399> .
?uri rdfs:label ?label .}"""
},
"bgee_gene": {
"uri": "http://purl.org/net/orth#Gene",
"label": "Gene",
"description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
"endpoint": "https://www.bgee.org/sparql/",
"query": """PREFIX orth: <http://purl.org/net/orth#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
?uri a orth:Gene .
?uri rdfs:label ?label .}"""
},
"oma_protein": {
"uri": "http://purl.org/net/orth#Protein",
"label": "Protein",
"description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
"endpoint": "https://sparql.omabrowser.org/sparql/",
"query": """PREFIX dc: <http://purl.org/dc/terms/>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
?uri a orth:Protein .
{?uri rdfs:label ?label .}
UNION {
?uri dc:identifier ?label .}
}"""
},
"oma_gene": {
"uri": "http://purl.org/net/orth#Gene",
"label": "Gene",
"description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
"endpoint": "https://sparql.omabrowser.org/sparql/",
"query": """PREFIX orth: <http://purl.org/net/orth#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
?uri a orth:Protein .
?uri rdfs:label ?label .}"""
},
"uniprot_gene": {
"uri": "http://purl.uniprot.org/core/Gene",
"label": "Gene",
"description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
"endpoint": "https://sparql.uniprot.org/sparql/",
"query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
?uri a up:Gene .
?uri skos:prefLabel ?label .}"""
},
"uniprot_protein": {
"uri": "http://purl.uniprot.org/core/Protein",
"label": "Protein",
"description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
"endpoint": "https://sparql.uniprot.org/sparql/",
"query": """PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
?uri a up:Protein .
?uri rdfs:label ?label .}"""
},
"uniprot_species": {
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
"description": "species scientific names",
"endpoint": "https://sparql.uniprot.org/sparql/",
"query": """PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?uri ?label
WHERE {
Expand All @@ -35,12 +120,26 @@
up:scientificName ?label .
}"""
},
"oma_species": {
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
"description": "species scientific names",
"endpoint": "https://sparql.omabrowser.org/sparql/",
"query": """PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?uri ?label
WHERE {
?uri a up:Taxon ;
up:rank up:Species ;
up:scientificName ?label .
}"""
}
}

docs: list[Document] = []
for entity in entities_list.values():
res = query_sparql(entity["query"], entity["endpoint"])
for entity_res in res["results"]["bindings"]:
entities_res = query_sparql(entity["query"], entity["endpoint"])["results"]["bindings"]
print(f"Found {len(entities_res)} entities for {entity['label']} in {entity['endpoint']}")
for entity_res in entities_res:
docs.append(
Document(
page_content=entity_res["label"]["value"],
Expand Down

0 comments on commit 31bc884

Please sign in to comment.