From e9a2a4b771a01c9f30fda2b53535fd1a2303f478 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 17 Jan 2024 11:17:21 +0000 Subject: [PATCH 1/4] modify criteria for finished Zooma mappings and add test --- cmat/trait_mapping/main.py | 2 +- cmat/trait_mapping/trait.py | 11 +++++------ tests/trait_mapping/test_main.py | 27 ++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cmat/trait_mapping/main.py b/cmat/trait_mapping/main.py index a06dab44..b35e0e7f 100644 --- a/cmat/trait_mapping/main.py +++ b/cmat/trait_mapping/main.py @@ -47,7 +47,7 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: """ logger.debug('Processing trait {}'.format(trait.name)) - trait.zooma_result_list = get_zooma_results(trait.name, filters, zooma_host, target_ontology) + trait.zooma_result_list = get_zooma_results(trait.name.lower(), filters, zooma_host, target_ontology) trait.process_zooma_results() if (trait.is_finished or len(trait.zooma_result_list) == 0 diff --git a/cmat/trait_mapping/trait.py b/cmat/trait_mapping/trait.py index f28fd664..3476f41b 100644 --- a/cmat/trait_mapping/trait.py +++ b/cmat/trait_mapping/trait.py @@ -58,12 +58,11 @@ def process_zooma_results(self): Check whether any Zooma mappings can be output as a finished ontology mapping. Put any finished mappings in finished_mapping_set """ - for mapping in self.zooma_result_list: - if mapping.confidence.lower() != "high": - continue - - for mapping in mapping.mapping_list: - if mapping.in_ontology and mapping.is_current: + for zooma_result in self.zooma_result_list: + for mapping in zooma_result.mapping_list: + # Accept either high-confidence mappings, or exact string matches + if mapping.in_ontology and mapping.is_current and (zooma_result.confidence.lower() == "high" + or zooma_result.zooma_label.lower() == self.name.lower()): ontology_entry = OntologyEntry(mapping.uri, mapping.ontology_label) self.finished_mapping_set.add(ontology_entry) diff --git a/tests/trait_mapping/test_main.py b/tests/trait_mapping/test_main.py index 29537b3c..6c6a3dc6 100644 --- a/tests/trait_mapping/test_main.py +++ b/tests/trait_mapping/test_main.py @@ -6,7 +6,8 @@ import pytest -from cmat.trait_mapping.main import parse_traits, process_traits +from cmat.trait_mapping.main import parse_traits, process_traits, process_trait +from cmat.trait_mapping.trait import Trait def get_test_resource(resource_name): @@ -62,3 +63,27 @@ def test_main(): mapped_terms = {x[0] for x in output_mappings} curation_terms = {x[0] for x in output_curation} assert len(mapped_terms) + len(curation_terms) == len(all_terms) + + +def test_process_trait_exact_match(): + # Exact match with MONDO:0009061 (in EFO and Mondo) + trait_name = 'Cystic Fibrosis' + # Don't use any data sources in Zooma as those will come back as high-confidence matches + zooma_filters = {'ontologies': 'efo,mondo,hp', + 'required': 'none', + 'preferred': 'none'} + zooma_host = 'https://www.ebi.ac.uk' + # Don't use OxO + oxo_targets = [] + oxo_distance = 0 + + # This should be marked as finished, as it's an exact string match with a term contained in the target ontology + efo_trait = process_trait(Trait(trait_name, None, None), zooma_filters, zooma_host, oxo_targets, oxo_distance, + target_ontology='efo') + assert efo_trait.is_finished + + # This should not be marked as finished, even though Zooma finds an exact match in one of its ontologies, it's not + # the requested target ontology and thus still needs to be curated + hpo_trait = process_trait(Trait(trait_name, None, None), zooma_filters, zooma_host, oxo_targets, oxo_distance, + target_ontology='hp') + assert not hpo_trait.is_finished From ae0a233c9385fc1f3267a030399ceb50378f13a9 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 17 Jan 2024 12:52:03 +0000 Subject: [PATCH 2/4] update for Ensembl 111 --- tests/pipelines/resources/expected/consequences_snp.tsv | 3 ++- tests/pipelines/resources/expected/evidence_strings.json | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/resources/expected/consequences_snp.tsv b/tests/pipelines/resources/expected/consequences_snp.tsv index 2c52ad6d..885c7542 100644 --- a/tests/pipelines/resources/expected/consequences_snp.tsv +++ b/tests/pipelines/resources/expected/consequences_snp.tsv @@ -411,6 +411,7 @@ 17:65558594:G:A ENSG00000168646 AXIN2 synonymous_variant 17:6556374:C:T ENSG00000091622 PITPNM3 intron_variant 17:6628328:A:G ENSG00000198920 KIAA0753 synonymous_variant +17:6628328:A:G ENSG00000282936 3_prime_UTR_variant 17:6693178:A:AACACACACACAC ENSG00000141485 SLC13A5 splice_polypyrimidine_tract_variant 17:7224245:T:C ENSG00000072778 ACADVL splice_donor_variant 17:73201307:A:G ENSG00000166685 COG1 missense_variant @@ -900,7 +901,7 @@ 4:120785105:A:G ENSG00000138738 PRDM5 splice_polypyrimidine_tract_variant 4:121847458:G:T ENSG00000138686 BBS7 missense_variant 4:121853086:C:T ENSG00000138686 BBS7 missense_variant -4:122979387:G:T ENSG00000145375 SPATA5 splice_donor_variant +4:122979387:G:T ENSG00000145375 AFG2A splice_donor_variant 4:127881908:G:T ENSG00000142731 PLK4 missense_variant 4:127930737:G:T ENSG00000164073 MFSD8 missense_variant 4:128959797:C:A ENSG00000151466 SCLT1 intron_variant diff --git a/tests/pipelines/resources/expected/evidence_strings.json b/tests/pipelines/resources/expected/evidence_strings.json index 91a28d5a..8c14aacf 100644 --- a/tests/pipelines/resources/expected/evidence_strings.json +++ b/tests/pipelines/resources/expected/evidence_strings.json @@ -788,6 +788,7 @@ {"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["uncertain significance"], "confidence": "criteria provided, single submitter", "studyId": "RCV001373139", "releaseDate": "2021-04-13", "targetFromSourceId": "ENSG00000163930", "variantFunctionalConsequenceId": "SO_0001583", "variantId": "3_52402628_G_A", "cohortPhenotypes": ["BAP1 tumor predisposition syndrome", "BAP1-related tumor predisposition syndrome", "Tumor predisposition syndrome", "Tumor susceptibility linked to germline BAP1 mutations"], "diseaseFromSource": "BAP1-related tumor predisposition syndrome", "diseaseFromSourceId": "C3280492", "diseaseFromSourceMappedId": "MONDO_0013692", "variantHgvsId": "NC_000003.12:g.52402628G>A"} {"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["uncertain significance"], "confidence": "criteria provided, single submitter", "studyId": "RCV001373139", "releaseDate": "2021-04-13", "targetFromSourceId": "ENSG00000163930", "variantFunctionalConsequenceId": "SO_0001583", "variantId": "3_52402628_G_A", "cohortPhenotypes": ["BAP1 tumor predisposition syndrome", "BAP1-related tumor predisposition syndrome", "Tumor predisposition syndrome", "Tumor susceptibility linked to germline BAP1 mutations"], "diseaseFromSource": "BAP1-related tumor predisposition syndrome", "diseaseFromSourceId": "C3280492", "diseaseFromSourceMappedId": "Orphanet_289539", "variantHgvsId": "NC_000003.12:g.52402628G>A"} {"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["benign"], "confidence": "criteria provided, single submitter", "studyId": "RCV001730858", "releaseDate": "2022-02-20", "targetFromSourceId": "ENSG00000198920", "variantFunctionalConsequenceId": "SO_0001819", "variantId": "17_6628328_A_G", "cohortPhenotypes": ["Joubert syndrome 38"], "diseaseFromSource": "Joubert syndrome 38", "diseaseFromSourceId": "C5561958", "variantHgvsId": "NC_000017.11:g.6628328A>G"} +{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["benign"], "confidence": "criteria provided, single submitter", "studyId": "RCV001730858", "releaseDate": "2022-02-20", "targetFromSourceId": "ENSG00000282936", "variantFunctionalConsequenceId": "SO_0001624", "variantId": "17_6628328_A_G", "cohortPhenotypes": ["Joubert syndrome 38"], "diseaseFromSource": "Joubert syndrome 38", "diseaseFromSourceId": "C5561958", "variantHgvsId": "NC_000017.11:g.6628328A>G"} {"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["likely benign"], "confidence": "criteria provided, single submitter", "studyId": "RCV002057147", "releaseDate": "2022-06-09", "targetFromSourceId": "ENSG00000115904", "variantFunctionalConsequenceId": "SO_0001627", "variantId": "2_39120324_C_A", "variantRsId": "rs368569135", "cohortPhenotypes": ["Noonan spectrum disorder", "RASopathy", "rasopathies"], "diseaseFromSource": "RASopathy", "diseaseFromSourceId": "C5555857", "diseaseFromSourceMappedId": "EFO_1001502", "variantHgvsId": "NC_000002.12:g.39120324C>A"} {"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["pathogenic"], "confidence": "criteria provided, single submitter", "studyId": "RCV002247286", "releaseDate": "2022-06-09", "targetFromSourceId": "ENSG00000198712", "variantFunctionalConsequenceId": "SO_0001631", "variantId": "MT_7512_T_C", "variantRsId": "rs199474817", "cohortPhenotypes": ["COX deficiency", "Complex 4 mitochondrial respiratory chain deficiency", "Complex IV deficiency", "Cytochrome-c oxidase deficiency", "Cytochrome-c oxidase deficiency disease", "Deficiency of mitochondrial respiratory chain complex4", "MITOCHONDRIAL COMPLEX IV DEFICIENCY, NUCLEAR TYPE 1", "Mitochondrial complex IV deficiency"], "diseaseFromSource": "Cytochrome-c oxidase deficiency disease", "diseaseFromSourceId": "C5435656", "diseaseFromSourceMappedId": "MONDO_0009068", "variantHgvsId": "NC_012920.1:m.7512T>C"} {"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["pathogenic"], "confidence": "criteria provided, single submitter", "studyId": "RCV002247286", "releaseDate": "2022-06-09", "targetFromSourceId": "ENSG00000198786", "variantFunctionalConsequenceId": "SO_0001631", "variantId": "MT_7512_T_C", "variantRsId": "rs199474817", "cohortPhenotypes": ["COX deficiency", "Complex 4 mitochondrial respiratory chain deficiency", "Complex IV deficiency", "Cytochrome-c oxidase deficiency", "Cytochrome-c oxidase deficiency disease", "Deficiency of mitochondrial respiratory chain complex4", "MITOCHONDRIAL COMPLEX IV DEFICIENCY, NUCLEAR TYPE 1", "Mitochondrial complex IV deficiency"], "diseaseFromSource": "Cytochrome-c oxidase deficiency disease", "diseaseFromSourceId": "C5435656", "diseaseFromSourceMappedId": "MONDO_0009068", "variantHgvsId": "NC_012920.1:m.7512T>C"} From 869367190bce8dcec934b4300ee38fd93eeb2137 Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 19 Jan 2024 11:01:28 +0000 Subject: [PATCH 3/4] address review comments --- cmat/trait_mapping/trait.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmat/trait_mapping/trait.py b/cmat/trait_mapping/trait.py index 3476f41b..76113f6a 100644 --- a/cmat/trait_mapping/trait.py +++ b/cmat/trait_mapping/trait.py @@ -60,7 +60,7 @@ def process_zooma_results(self): """ for zooma_result in self.zooma_result_list: for mapping in zooma_result.mapping_list: - # Accept either high-confidence mappings, or exact string matches + # Accept current mappings in the target ontology with either high-confidence or exact string matches if mapping.in_ontology and mapping.is_current and (zooma_result.confidence.lower() == "high" or zooma_result.zooma_label.lower() == self.name.lower()): ontology_entry = OntologyEntry(mapping.uri, mapping.ontology_label) From 6dcd166ab36b551b22cd5a052f25f86670d78433 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 22 Jan 2024 14:52:26 +0000 Subject: [PATCH 4/4] add explicit OLS search before querying Zooma --- cmat/trait_mapping/main.py | 13 ++++++++++--- cmat/trait_mapping/ols.py | 29 +++++++++++++++++++++++++++++ tests/trait_mapping/test_main.py | 8 ++++---- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/cmat/trait_mapping/main.py b/cmat/trait_mapping/main.py index b35e0e7f..6f859343 100644 --- a/cmat/trait_mapping/main.py +++ b/cmat/trait_mapping/main.py @@ -4,10 +4,11 @@ from collections import Counter from cmat.clinvar_xml_io import ClinVarTrait +from cmat.trait_mapping.ols import get_uri_from_exact_match from cmat.trait_mapping.output import output_trait from cmat.trait_mapping.oxo import get_oxo_results from cmat.trait_mapping.oxo import uris_to_oxo_format -from cmat.trait_mapping.trait import Trait +from cmat.trait_mapping.trait import Trait, OntologyEntry from cmat.trait_mapping.trait_names_parsing import parse_trait_names from cmat.trait_mapping.zooma import get_zooma_results @@ -32,8 +33,9 @@ def get_uris_for_oxo(zooma_result_list: list) -> set: def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list, oxo_distance: int, target_ontology: str = 'EFO') -> Trait: """ - Process a single trait. Find any mappings in Zooma. If there are no high confidence Zooma - mappings that are in EFO then query OxO with any high confidence mappings not in EFO. + Process a single trait. First look for an exact string match in the target ontology and return immediately if found. + Otherwise find any mappings in Zooma. If there are no high confidence Zooma mappings that are in EFO then query OxO + with any high confidence mappings not in EFO. :param trait: The trait to be processed. :param filters: A dictionary of filters to use for querying Zooma. @@ -47,6 +49,11 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: """ logger.debug('Processing trait {}'.format(trait.name)) + string_match_uri = get_uri_from_exact_match(trait.name.lower(), target_ontology) + if string_match_uri: + trait.finished_mapping_set.add(OntologyEntry(string_match_uri, trait.name.lower())) + return trait + trait.zooma_result_list = get_zooma_results(trait.name.lower(), filters, zooma_host, target_ontology) trait.process_zooma_results() if (trait.is_finished diff --git a/cmat/trait_mapping/ols.py b/cmat/trait_mapping/ols.py index 3acfdfee..f746867b 100644 --- a/cmat/trait_mapping/ols.py +++ b/cmat/trait_mapping/ols.py @@ -1,3 +1,4 @@ +import os from functools import lru_cache import logging import requests @@ -121,3 +122,31 @@ def get_replacement_term(uri: str, ontology: str = 'EFO') -> str: if response_json["term_replaced_by"] is not None: return response_json["term_replaced_by"] return "" + + +@lru_cache(maxsize=16384) +@retry(exceptions=(ConnectionError, requests.RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3)) +def get_uri_from_exact_match(text, ontology='EFO'): + """ + Finds URI from target ontology for a given text based on exact string match. + + :param text: String to search for + :param ontology: ID of target ontology to query (default EFO) + :return: URI of matching term or None if not found + """ + search_url = os.path.join(OLS_SERVER, f'api/search?ontology={ontology}&q={text}&queryFields=label&exact=true') + response = requests.get(search_url) + response.raise_for_status() + data = response.json() + if 'response' in data: + results = data['response']['docs'] + candidates = set() + for result in results: + # Check that we've found the term exactly (strict case-insensitive string match) + if result['label'].lower() == text.lower(): + candidates.add(result['iri']) + # Only return a result if we can find it unambiguously + if len(candidates) == 1: + return candidates.pop() + logger.warning(f'Could not find an IRI for {text}') + return None diff --git a/tests/trait_mapping/test_main.py b/tests/trait_mapping/test_main.py index 6c6a3dc6..a9293b6b 100644 --- a/tests/trait_mapping/test_main.py +++ b/tests/trait_mapping/test_main.py @@ -68,10 +68,10 @@ def test_main(): def test_process_trait_exact_match(): # Exact match with MONDO:0009061 (in EFO and Mondo) trait_name = 'Cystic Fibrosis' - # Don't use any data sources in Zooma as those will come back as high-confidence matches - zooma_filters = {'ontologies': 'efo,mondo,hp', - 'required': 'none', - 'preferred': 'none'} + # Use our default Zooma filters + zooma_filters = {'ontologies': 'efo,ordo,hp,mondo', + 'required': 'cttv,eva-clinvar,clinvar-xrefs,gwas', + 'preferred': 'eva-clinvar,cttv,gwas,clinvar-xrefs'} zooma_host = 'https://www.ebi.ac.uk' # Don't use OxO oxo_targets = []