diff --git a/notebooks/test_expasy_chat.ipynb b/notebooks/test_expasy_chat.ipynb index 494679a..5eac21d 100644 --- a/notebooks/test_expasy_chat.ipynb +++ b/notebooks/test_expasy_chat.ipynb @@ -294,6 +294,53 @@ "\n", "\n", " # New queries to test:\n", + " # What are the genes expressed in the human brain?\n", + " # FAILS to add filter for human\n", + " # What are the human genes expressed in the brain? WORKS as expected\n", + "\n", + " # Which are the human genes associated with cancer and their orthologs?\n", + " # This one does not work because in the generated query the variable in the UniProt block ?protein does not match the one used in the OMA block, ?humanUniprot...\n", + " # TODO: when we parse the query, check there is a link between the two blocks (the 2 blocks are using the same variable)\n", + " # https://sibkru.atlassian.net/jira/software/projects/E4/boards/6?selectedIssue=E4-34\n", + "# {\n", + "# \"question\": \"Which are the human genes associated with cancer and their orthologs? 
Return ?humanGeneName ?orthologUniprot, and limit to 10\",\n", + "# \"endpoint\": \"https://sparql.uniprot.org/sparql/\",\n", + "# \"query\": \"\"\"PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n", + "# PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n", + "# PREFIX taxon: <http://purl.uniprot.org/taxonomy/>\n", + "# PREFIX up: <http://purl.uniprot.org/core/>\n", + "# PREFIX orth: <http://purl.org/net/orth#>\n", + "# PREFIX lscr: <http://purl.org/lscr#>\n", + "# SELECT DISTINCT ?humanGeneName ?orthologProtein ?orthologUniprot\n", + "# WHERE {\n", + "# # Retrieve human genes associated with cancer from UniProt\n", + "# ?humanUniprot a up:Protein ;\n", + "# up:organism taxon:9606 ;\n", + "# up:encodedBy ?gene ;\n", + "# up:annotation ?annotation .\n", + "# ?annotation a up:Disease_Annotation ;\n", + "# rdfs:comment ?diseaseComment .\n", + "# FILTER(CONTAINS(LCASE(?diseaseComment), \"cancer\"))\n", + "# ?gene skos:prefLabel ?humanGeneName .\n", + "\n", + "# # Find orthologs of these genes using OMA\n", + "# SERVICE <https://sparql.omabrowser.org/sparql/> {\n", + "# ?cluster a orth:OrthologsCluster ;\n", + "# orth:hasHomologousMember ?node1 ;\n", + "# orth:hasHomologousMember ?node2 .\n", + "# ?node1 orth:hasHomologousMember* ?humanProtein .\n", + "# ?node2 orth:hasHomologousMember* ?orthologProtein .\n", + "# ?humanProtein lscr:xrefUniprot ?humanUniprot .\n", + "# ?orthologProtein lscr:xrefUniprot ?orthologUniprot .\n", + "# FILTER(?node1 != ?node2)\n", + "# } } LIMIT 10\"\"\",\n", + "# },\n", + "\n", + "\n", + " # List human genes that have known orthologs in the rat and are expressed in the brain?\n", + "\n", + "\n", + " # Which are the human genes associated with cancer and their orthologs expressed in the rat brain?\n", " # Find all proteins linked to arachidonate (CHEBI:32395) and their associated pathways\n", " # List all enzymes that have been experimentally validated and are involved in DNA repair\n", " # Find all proteins that have a mutagenesis annotation affecting their active site\n", @@ -383,6 +430,8 @@ " # } LIMIT 20\"\"\",\n", " # },\n", "\n", + " # Which are the human genes associated with lung cancer and their orthologs expressed in the rat brain?\n", + "\n", " # {\n", 
" # \"question\": \"Which are the human genes associated with cancer (which have cancer in their disease label) and their orthologs expressed in the rat brain? Return the disease label, human gene URI, human gene HGNC symbol, ortholog rat gene URI\",\n", " # \"endpoint\": \"https://sparql.uniprot.org/sparql/\",\n", diff --git a/src/sparql_llm/api.py b/src/sparql_llm/api.py index d53f83c..6acd426 100644 --- a/src/sparql_llm/api.py +++ b/src/sparql_llm/api.py @@ -356,6 +356,7 @@ def chat_ui(request: Request) -> Any: "How can I get the HGNC symbol for the protein P68871?", "What are the rat orthologs of the human TP53?", "Where is expressed the gene ACE2 in human?", + "List the genes in primates orthologous to genes expressed in the fruit fly's eye", # "Say hi", # "Which are the genes, expressed in the rat, corresponding to human genes associated with cancer?", # "What is the gene associated with the protein P68871?", diff --git a/src/sparql_llm/embed.py b/src/sparql_llm/embed.py index e3ab0fc..582894d 100644 --- a/src/sparql_llm/embed.py +++ b/src/sparql_llm/embed.py @@ -58,10 +58,11 @@ def load_schemaorg_description(endpoint: dict[str, str]) -> list[Document]: g = ConjunctiveGraph() for json_ld_tag in json_ld_tags: json_ld_content = json_ld_tag.string + # print(json_ld_content) if json_ld_content: g.parse(data=json_ld_content, format="json-ld") # json_ld_content = json.loads(json_ld_content) - question = f"What are the general metadata about {endpoint['label']} resource? (description, creators, license, dates, version, etc)" + question = f"What are the general metadata about {endpoint['label']} resource? (description, creators, maintainers, license, dates, version, etc)" docs.append( Document( page_content=question,