From 304b8dfe9db192aa60acf17e9f8aa7aa4acc54a2 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 15 Aug 2024 15:52:56 +0100 Subject: [PATCH 1/4] adding SCV model --- cmat/clinvar_xml_io/clinvar_record.py | 30 ++++++++-------- .../clinvar_submitted_record.py | 35 +++++++++++++++++++ cmat/output_generation/annotated_clinvar.py | 4 +-- .../clinvar-variant-types.py | 4 +-- 4 files changed, 54 insertions(+), 19 deletions(-) create mode 100644 cmat/clinvar_xml_io/clinvar_submitted_record.py diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index 72c28cb4..0f0678f5 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -22,14 +22,14 @@ class ClinVarRecord: # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful. NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'} - def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure): + def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure): """Initialise a ClinVar record object from an RCV XML record.""" - self.rcv = rcv + self.record_xml = record_xml self.xsd_version = xsd_version # Add a list of traits self.trait_set = [] - for trait in find_elements(self.rcv, './TraitSet/Trait'): + for trait in find_elements(self.record_xml, './TraitSet/Trait'): self.trait_set.append(trait_class(trait, self)) # We are currently only processing MeasureSets of type Variant which are included directly in the RCV record. @@ -37,27 +37,27 @@ def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=Cli # * MeasureSet of types "Haplotype", "Phase unknown", or "Distinct chromosomes" # * GenotypeSet, which contains an assertion about a group of variants from different chromosome copies, with # the type of be either a "CompoundHeterozygote" or a "Diplotype" - variant_measure = find_optional_unique_element(self.rcv, './MeasureSet[@Type="Variant"]/Measure') + variant_measure = find_optional_unique_element(self.record_xml, './MeasureSet[@Type="Variant"]/Measure') if not variant_measure: self.measure = None else: self.measure = measure_class(variant_measure, self) - # List of clinical classifications (Germline, Somatic, or Oncogenecity + # List of clinical classifications (Germline, Somatic, or Oncogenecity) self.clinical_classifications = [] if self.xsd_version < 2: # V1 only ever has a single clinical classification / clinical significance self.clinical_classifications.append( - ClinicalClassification(find_mandatory_unique_element(self.rcv, './ClinicalSignificance'), self)) + ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self)) else: - for clin_class in find_elements(self.rcv, './Classifications/*'): + for clin_class in find_elements(self.record_xml, './Classifications/*'): self.clinical_classifications.append(ClinicalClassification(clin_class, self)) def __str__(self): return f'ClinVarRecord object with accession {self.accession}' def write(self, output): - xml_str = minidom.parseString(ElementTree.tostring(self.rcv)).toprettyxml(indent=' ', encoding='utf-8') + xml_str = minidom.parseString(ElementTree.tostring(self.record_xml)).toprettyxml(indent=' ', encoding='utf-8') # version 3.8 adds superfluous root if xml_str.startswith(b'', b'', xml_str) @@ -67,28 +67,28 @@ def write(self, output): @property def accession(self): - return find_mandatory_unique_element(self.rcv, './ClinVarAccession').attrib['Acc'] + return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc'] @property def date(self): """This tracks the latest update date, counting even minor technical updates.""" - return self.rcv.attrib['DateLastUpdated'] + return self.record_xml.attrib['DateLastUpdated'] @property def created_date(self): """This tracks the date the record was first made public on ClinVar.""" - return self.rcv.attrib['DateCreated'] + return self.record_xml.attrib['DateCreated'] @property def mode_of_inheritance(self): """Return a (possibly empty) list of modes of inheritance for a given ClinVar record.""" return sorted({ - elem.text for elem in find_elements(self.rcv, './AttributeSet/Attribute[@Type="ModeOfInheritance"]') + elem.text for elem in find_elements(self.record_xml, './AttributeSet/Attribute[@Type="ModeOfInheritance"]') }) @property def trait_set_type(self): - return find_mandatory_unique_element(self.rcv, './TraitSet').attrib['Type'] + return find_mandatory_unique_element(self.record_xml, './TraitSet').attrib['Type'] @property def traits(self): @@ -106,11 +106,11 @@ def evidence_support_pubmed_refs(self): specific disease. These are the references displayed on the ClinVar website in the "Assertion and evidence details" section at the bottom of the page.""" return [int(elem.text) - for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')] + for elem in find_elements(self.record_xml, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')] @property def allele_origins(self): - return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')} + return {elem.text for elem in find_elements(self.record_xml, './ObservedIn/Sample/Origin')} @property def valid_allele_origins(self): diff --git a/cmat/clinvar_xml_io/clinvar_submitted_record.py b/cmat/clinvar_xml_io/clinvar_submitted_record.py new file mode 100644 index 00000000..7400dcd9 --- /dev/null +++ b/cmat/clinvar_xml_io/clinvar_submitted_record.py @@ -0,0 +1,35 @@ +import logging + +from cmat.clinvar_xml_io import ClinVarRecord +from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class ClinVarSubmittedRecord(ClinVarRecord): + """ + Submitted records (SCVs) are structured similarly to reference records (RCVs), though typically with fewer + annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are added by curators. + However, these attributes are also technically optional in the RCVs so the code inheritance is possible. + + SCVs also contain additional information about the actual submission, which we model in this class. + """ + + def __init__(self, record_xml, xsd_version, reference_record): + super().__init__(record_xml, xsd_version) + # Each SCV is associated with a single RCV + self.reference_record = reference_record + + @property + def submission_date(self): + return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitterDate'] + + @property + def submitter(self): + return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitter'] + + @property + def submission_name(self): + # TODO - check whether this is the correct property to filter on + return self.record_xml.attrib['SubmissionName'] diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py index c39d4974..c71450f5 100644 --- a/cmat/output_generation/annotated_clinvar.py +++ b/cmat/output_generation/annotated_clinvar.py @@ -213,8 +213,8 @@ def print_counter(counter): class AnnotatedClinVarRecord(ClinVarRecord): - def __init__(self, rcv, xsd_version): - super().__init__(rcv, xsd_version, trait_class=OntologyMappedClinVarTrait, + def __init__(self, record_xml, xsd_version): + super().__init__(record_xml, xsd_version, trait_class=OntologyMappedClinVarTrait, measure_class=EnsemblAnnotatedClinVarMeasure) diff --git a/data-exploration/clinvar-variant-types/clinvar-variant-types.py b/data-exploration/clinvar-variant-types/clinvar-variant-types.py index d80022fc..4f5547b4 100644 --- a/data-exploration/clinvar-variant-types/clinvar-variant-types.py +++ b/data-exploration/clinvar-variant-types/clinvar-variant-types.py @@ -125,8 +125,8 @@ def rcv_to_link(rcv_id): rcv_id = clinvar_record.accession # RCV can contain either a MeasureSet, or a GenotypeSet. It must not contain both. - measure_sets = clinvar_record.rcv.findall('MeasureSet') - genotype_sets = clinvar_record.rcv.findall('GenotypeSet') + measure_sets = clinvar_record.record_xml.findall('MeasureSet') + genotype_sets = clinvar_record.record_xml.findall('GenotypeSet') if len(measure_sets) == 1 and len(genotype_sets) == 0: # Most common case. RCV directly contains one measure set. measure_set = measure_sets[0] From 2100310e980cba68e0fc8da0b0ceaa17ff872242 Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 16 Aug 2024 15:56:19 +0100 Subject: [PATCH 2/4] refactor, add ClinVarSet and tests --- bin/cmat/VERSION | 2 +- cmat/clinvar_xml_io/clinvar_record.py | 26 +++--- cmat/clinvar_xml_io/clinvar_set.py | 30 +++++++ .../clinvar_submitted_record.py | 36 +++++++-- cmat/clinvar_xml_io/xml_parsing.py | 16 ++-- data-exploration/filter_clinvar_xml.py | 14 +--- tests/clinvar_xml_io/test_clinvar_measure.py | 40 ++++++++++ tests/clinvar_xml_io/test_clinvar_record.py | 42 ++++++++++ .../test_clinvar_submitted_record.py | 58 ++++++++++++++ tests/output_generation/test_clinvar.py | 79 ------------------- 10 files changed, 228 insertions(+), 115 deletions(-) create mode 100644 cmat/clinvar_xml_io/clinvar_set.py create mode 100644 tests/clinvar_xml_io/test_clinvar_measure.py create mode 100644 tests/clinvar_xml_io/test_clinvar_submitted_record.py delete mode 100644 tests/output_generation/test_clinvar.py diff --git a/bin/cmat/VERSION b/bin/cmat/VERSION index acf9bf09..97c61a6d 100644 --- a/bin/cmat/VERSION +++ b/bin/cmat/VERSION @@ -1 +1 @@ -3.2.2 \ No newline at end of file +3.3.0.dev \ No newline at end of file diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index 0f0678f5..db0ac421 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -1,6 +1,7 @@ import logging import re import xml.etree.ElementTree as ElementTree +from functools import cached_property from xml.dom import minidom from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError @@ -43,16 +44,6 @@ def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_cl else: self.measure = measure_class(variant_measure, self) - # List of clinical classifications (Germline, Somatic, or Oncogenecity) - self.clinical_classifications = [] - if self.xsd_version < 2: - # V1 only ever has a single clinical classification / clinical significance - self.clinical_classifications.append( - ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self)) - else: - for clin_class in find_elements(self.record_xml, './Classifications/*'): - self.clinical_classifications.append(ClinicalClassification(clin_class, self)) - def __str__(self): return f'ClinVarRecord object with accession {self.accession}' @@ -70,7 +61,7 @@ def accession(self): return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc'] @property - def date(self): + def last_updated_date(self): """This tracks the latest update date, counting even minor technical updates.""" return self.record_xml.attrib['DateLastUpdated'] @@ -117,6 +108,19 @@ def valid_allele_origins(self): """Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms.""" return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS} + @cached_property + def clinical_classifications(self): + """List of clinical classifications (Germline, Somatic, or Oncogenecity)""" + clinical_classifications = [] + if self.xsd_version < 2: + # V1 only ever has a single clinical classification / clinical significance + clinical_classifications.append( + ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self)) + else: + for clin_class in find_elements(self.record_xml, './Classifications/*'): + clinical_classifications.append(ClinicalClassification(clin_class, self)) + return clinical_classifications + # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord # if there is exactly one ClinicalClassification for the record. # Otherwise these should be taken from the ClinicalClassification objects directly. diff --git a/cmat/clinvar_xml_io/clinvar_set.py b/cmat/clinvar_xml_io/clinvar_set.py new file mode 100644 index 00000000..7f490b13 --- /dev/null +++ b/cmat/clinvar_xml_io/clinvar_set.py @@ -0,0 +1,30 @@ +from cmat.clinvar_xml_io import ClinVarRecord +from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord +from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements + + +class ClinVarSet: + """ + A ClinVarSet groups together a single reference record (RCV) and one or more submitted records (SCVs). + """ + + def __init__(self, cvs_xml, xsd_version): + self.cvs_xml = cvs_xml + + rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion') + self.rcv = ClinVarRecord(rcv_elem, xsd_version) + + scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True) + self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems] + + @property + def id(self): + return self.cvs_xml.attrib['ID'] + + @property + def title(self): + return find_mandatory_unique_element(self.cvs_xml, './Title').text + + @property + def status(self): + return find_mandatory_unique_element(self.cvs_xml, './RecordStatus').text diff --git a/cmat/clinvar_xml_io/clinvar_submitted_record.py b/cmat/clinvar_xml_io/clinvar_submitted_record.py index 7400dcd9..1d0264b6 100644 --- a/cmat/clinvar_xml_io/clinvar_submitted_record.py +++ b/cmat/clinvar_xml_io/clinvar_submitted_record.py @@ -1,4 +1,5 @@ import logging +from functools import cached_property from cmat.clinvar_xml_io import ClinVarRecord from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element @@ -9,9 +10,9 @@ class ClinVarSubmittedRecord(ClinVarRecord): """ - Submitted records (SCVs) are structured similarly to reference records (RCVs), though typically with fewer - annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are added by curators. - However, these attributes are also technically optional in the RCVs so the code inheritance is possible. + Submitted records (SCVs) are structured similarly to reference records (RCVs) with a few exceptions, though they + typically have fewer annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are + added by curators. SCVs also contain additional information about the actual submission, which we model in this class. """ @@ -21,15 +22,40 @@ def __init__(self, record_xml, xsd_version, reference_record): # Each SCV is associated with a single RCV self.reference_record = reference_record + def __str__(self): + return f'ClinVarSubmittedRecord object with accession {self.accession}' + @property def submission_date(self): + """Date of submission or when submission was last revised (for first submission, use created_date).""" return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitterDate'] + @property + def last_updated_date(self): + """Overrides parent definition, in SCV this date is in the accession element""" + return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateUpdated'] + + @property + def created_date(self): + """Overrides parent definition, in SCV this date is in the accession element""" + return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateCreated'] + @property def submitter(self): + """Name of the submitting organization.""" return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitter'] + @property + def submitter_id(self): + """Numeric identifier associated with the submitting organization.""" + return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['OrgID'] + @property def submission_name(self): - # TODO - check whether this is the correct property to filter on - return self.record_xml.attrib['SubmissionName'] + """Name or identifier associated with the submission. This is optional.""" + return self.record_xml.attrib.get('SubmissionName', None) + + @cached_property + def clinical_classifications(self): + # Submitted record clinical classifications are defined a bit differently than reference records + raise NotImplementedError('Clinical classification parsing not implemented for SCVs') diff --git a/cmat/clinvar_xml_io/xml_parsing.py b/cmat/clinvar_xml_io/xml_parsing.py index 1023e70e..aa12c9ff 100644 --- a/cmat/clinvar_xml_io/xml_parsing.py +++ b/cmat/clinvar_xml_io/xml_parsing.py @@ -30,18 +30,22 @@ def parse_header_attributes(clinvar_xml): def iterate_rcv_from_xml(clinvar_xml): """Iterates through the gzipped ClinVar XML and yields complete records.""" + for cvs in iterate_cvs_from_xml(clinvar_xml): + # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of + # ClinVar. There should only be one such record per ClinVarSet. + rcv = find_mandatory_unique_element(cvs, 'ReferenceClinVarAssertion') + yield rcv + + +def iterate_cvs_from_xml(clinvar_xml): + """Iterates through the gzipped ClinVar XML and yields complete elements.""" with gzip.open(clinvar_xml, 'rt') as fh: for event, elem in ElementTree.iterparse(fh): # Wait until we have built a complete ClinVarSet element if elem.tag != 'ClinVarSet': continue - - # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of - # ClinVar. There should only be one such record per ClinVarSet. - rcv = find_mandatory_unique_element(elem, 'ReferenceClinVarAssertion') - # Return the complete record and then remove the processed element from the tree to save memory - yield rcv + yield elem elem.clear() diff --git a/data-exploration/filter_clinvar_xml.py b/data-exploration/filter_clinvar_xml.py index ad25eb4a..0795a9fb 100644 --- a/data-exploration/filter_clinvar_xml.py +++ b/data-exploration/filter_clinvar_xml.py @@ -4,7 +4,7 @@ import xml.etree.ElementTree as ElementTree from cmat.clinvar_xml_io import ClinVarRecord -from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element +from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, iterate_cvs_from_xml from cmat.output_generation.clinvar_to_evidence_strings import get_consequence_types from cmat.output_generation.consequence_type import process_consequence_type_file @@ -18,18 +18,6 @@ def pprint(x): print(ElementTree.tostring(x, encoding='unicode')) -def iterate_cvs_from_xml(clinvar_xml): - """Similar to iterate_rcv_from_xml in clinvar_xml_utils, but keeps the entire ClinVarSet XML element. - This allows us to construct a valid ClinVar XML for easy future processing.""" - with gzip.open(clinvar_xml, 'rt') as fh: - for event, elem in ElementTree.iterparse(fh): - # Wait until we have built a complete ClinVarSet element - if elem.tag != 'ClinVarSet': - continue - yield elem - elem.clear() - - def filter_xml(input_xml, output_xml, filter_fct, max_num=None): """ Filter input_xml by boolean condition defined by filter_fct and write to output_xml. If max_num is given, will write at most max_num records, otherwise writes all.""" diff --git a/tests/clinvar_xml_io/test_clinvar_measure.py b/tests/clinvar_xml_io/test_clinvar_measure.py new file mode 100644 index 00000000..d0bacda5 --- /dev/null +++ b/tests/clinvar_xml_io/test_clinvar_measure.py @@ -0,0 +1,40 @@ +import os + +from cmat.clinvar_xml_io import ClinVarDataset + +resources_dir = os.path.join(os.path.dirname(__file__), 'resources') + + +class TestClinvarRecordMeasure: + @classmethod + def setup_class(cls): + input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz') + cls.test_crm = next(iter(ClinVarDataset(input_file))).measure + + def test_hgvs(self): + text_hgvs = [h.text for h in self.test_crm.all_hgvs] + assert text_hgvs == ['NM_152443.3:c.677A>G', + 'NG_008321.1:g.32324A>G', + 'NC_000014.9:g.67729209A>G', + 'NC_000014.8:g.68195926A>G', + 'NM_152443.2:c.677A>G', + 'Q96NR8:p.Tyr226Cys', + 'NP_689656.2:p.Tyr226Cys'] + + def test_preferred_current_hgvs(self): + assert self.test_crm.preferred_current_hgvs.text == 'NC_000014.9:g.67729209A>G' + + def test_rs(self): + assert self.test_crm.rs_id == 'rs28940313' + + def test_nsv(self): + assert self.test_crm.nsv_id is None + + def test_variant_type(self): + assert self.test_crm.variant_type == 'single nucleotide variant' + + def test_measure_set_pubmed_refs(self): + assert self.test_crm.pubmed_refs == [] + + def test_so_terms(self): + assert self.test_crm.existing_so_terms == {'SO:0001583'} diff --git a/tests/clinvar_xml_io/test_clinvar_record.py b/tests/clinvar_xml_io/test_clinvar_record.py index f6000c55..a727560d 100644 --- a/tests/clinvar_xml_io/test_clinvar_record.py +++ b/tests/clinvar_xml_io/test_clinvar_record.py @@ -32,3 +32,45 @@ def test_multiple_clinical_classifications_record(): assert set(cc.type for cc in record.clinical_classifications) == {'GermlineClassification', 'SomaticClinicalImpact'} with pytest.raises(MultipleClinicalClassificationsError): print(record.valid_clinical_significances) + + +class TestClinvarRecord: + @classmethod + def setup_class(cls): + input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz') + cls.test_clinvar_record = next(iter(ClinVarDataset(input_file))) + + def test_date(self): + """Check that the last updated date of the referenceClinVarAssertion is loaded correctly""" + assert self.test_clinvar_record.last_updated_date == '2024-04-15' + + def test_score(self): + assert self.test_clinvar_record.score == 2 + + def test_review_status(self): + assert self.test_clinvar_record.review_status == 'criteria provided, multiple submitters, no conflicts' + + def test_acc(self): + assert self.test_clinvar_record.accession == 'RCV000002127' + + def test_traits(self): + assert self.test_clinvar_record.traits[0].preferred_name == 'Leber congenital amaurosis 13' + assert self.test_clinvar_record.traits[0].preferred_or_other_valid_name == 'Leber congenital amaurosis 13' + + def test_trait_pubmed_refs(self): + assert self.test_clinvar_record.traits[0].pubmed_refs == [20301590, 30285347] + + def test_observed_pubmed_refs(self): + assert self.test_clinvar_record.evidence_support_pubmed_refs == [15258582, 15322982] + + def test_clinical_significance(self): + assert self.test_clinvar_record.clinical_significance_list == ['likely pathogenic', 'pathogenic'] + + def test_allele_origins(self): + assert self.test_clinvar_record.allele_origins == {'germline', 'inherited', 'unknown'} + + def test_valid_allele_origins(self): + assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'} + + def test_trait_efo_ids(self): + assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')] diff --git a/tests/clinvar_xml_io/test_clinvar_submitted_record.py b/tests/clinvar_xml_io/test_clinvar_submitted_record.py new file mode 100644 index 00000000..df07668d --- /dev/null +++ b/tests/clinvar_xml_io/test_clinvar_submitted_record.py @@ -0,0 +1,58 @@ +import os + +import pytest + +from cmat.clinvar_xml_io.clinvar_set import ClinVarSet +from cmat.clinvar_xml_io.xml_parsing import iterate_cvs_from_xml + + +@pytest.fixture +def clinvar_set(): + resources_dir = os.path.join(os.path.dirname(__file__), 'resources') + input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz') + return ClinVarSet(next(iterate_cvs_from_xml(input_file)), 2.0) + + +@pytest.fixture +def submitted_record(clinvar_set): + return clinvar_set.scvs[0] + + +def test_clinvar_set(clinvar_set): + assert clinvar_set.rcv.accession == 'RCV000002127' + assert len(clinvar_set.scvs) == 5 + assert clinvar_set.id == '188870850' + assert clinvar_set.title == 'NM_152443.3(RDH12):c.677A>G (p.Tyr226Cys) AND Leber congenital amaurosis 13' + assert clinvar_set.status == 'current' + + +def test_clinvar_submitted_record(submitted_record): + assert submitted_record.accession == 'SCV000022285' + assert submitted_record.submitter == 'OMIM' + assert submitted_record.valid_allele_origins == {'germline'} + assert submitted_record.evidence_support_pubmed_refs == [15258582, 15322982] + + assert submitted_record.created_date == '2013-04-04' # submission first publicly available + assert submitted_record.submission_date == '2015-07-02' # submission last revised + assert submitted_record.last_updated_date == '2015-07-05' # submission last revision publicly available + + with pytest.raises(NotImplementedError): + assert submitted_record.valid_clinical_significances + + +def test_clinvar_submitted_record_trait(submitted_record): + assert len(submitted_record.traits_with_valid_names) == 1 + scv_trait = submitted_record.traits_with_valid_names[0] + + assert scv_trait.preferred_or_other_valid_name == 'LEBER CONGENITAL AMAUROSIS 13' + assert scv_trait.current_efo_aligned_xrefs == [] + + +def test_clinvar_submitted_record_measure(submitted_record): + assert submitted_record.measure is not None + scv_measure = submitted_record.measure + + assert scv_measure.preferred_or_other_name == 'RDH12, TYR226CYS' + assert scv_measure.preferred_current_hgvs is None + assert not scv_measure.has_complete_coordinates + assert scv_measure.variant_type == 'Variation' diff --git a/tests/output_generation/test_clinvar.py b/tests/output_generation/test_clinvar.py deleted file mode 100644 index 9dcedfbd..00000000 --- a/tests/output_generation/test_clinvar.py +++ /dev/null @@ -1,79 +0,0 @@ -from cmat.output_generation import consequence_type as CT - -import config - - -class TestClinvarRecord: - @classmethod - def setup_class(cls): - cls.test_clinvar_record = config.get_test_clinvar_record() - - def test_date(self): - """Check that the last updated date of the referenceClinVarAssertion is loaded correctly""" - assert self.test_clinvar_record.date == '2024-04-15' - - def test_score(self): - assert self.test_clinvar_record.score == 2 - - def test_review_status(self): - assert self.test_clinvar_record.review_status == 'criteria provided, multiple submitters, no conflicts' - - def test_acc(self): - assert self.test_clinvar_record.accession == 'RCV000002127' - - def test_traits(self): - assert self.test_clinvar_record.traits[0].preferred_name == 'Leber congenital amaurosis 13' - assert self.test_clinvar_record.traits[0].preferred_or_other_valid_name == 'Leber congenital amaurosis 13' - - def test_trait_pubmed_refs(self): - assert self.test_clinvar_record.traits[0].pubmed_refs == [20301590, 30285347] - - def test_observed_pubmed_refs(self): - assert self.test_clinvar_record.evidence_support_pubmed_refs == [15258582, 15322982] - - def test_clinical_significance(self): - assert self.test_clinvar_record.clinical_significance_list == ['likely pathogenic', 'pathogenic'] - - def test_allele_origins(self): - assert self.test_clinvar_record.allele_origins == {'germline', 'inherited', 'unknown'} - - def test_valid_allele_origins(self): - assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'} - - def test_trait_efo_ids(self): - assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')] - - -class TestClinvarRecordMeasure: - @classmethod - def setup_class(cls): - cls.test_crm = config.get_test_clinvar_record().measure - cls.consequence_type_dict = CT.process_consequence_type_file(config.snp_2_gene_file) - - def test_hgvs(self): - text_hgvs = [h.text for h in self.test_crm.all_hgvs] - assert text_hgvs == ['NM_152443.3:c.677A>G', - 'NG_008321.1:g.32324A>G', - 'NC_000014.9:g.67729209A>G', - 'NC_000014.8:g.68195926A>G', - 'NM_152443.2:c.677A>G', - 'Q96NR8:p.Tyr226Cys', - 'NP_689656.2:p.Tyr226Cys'] - - def test_preferred_current_hgvs(self): - assert self.test_crm.preferred_current_hgvs.text == 'NC_000014.9:g.67729209A>G' - - def test_rs(self): - assert self.test_crm.rs_id == 'rs28940313' - - def test_nsv(self): - assert self.test_crm.nsv_id is None - - def test_variant_type(self): - assert self.test_crm.variant_type == 'single nucleotide variant' - - def test_measure_set_pubmed_refs(self): - assert self.test_crm.pubmed_refs == [] - - def test_so_terms(self): - assert self.test_crm.existing_so_terms == {'SO:0001583'} From 0107f0c53b3a936ce45089e0db977aec2cad17d2 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 19 Aug 2024 11:57:37 +0100 Subject: [PATCH 3/4] add submitted record tests --- tests/clinvar_xml_io/test_clinvar_submitted_record.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/clinvar_xml_io/test_clinvar_submitted_record.py b/tests/clinvar_xml_io/test_clinvar_submitted_record.py index df07668d..7a7be2b5 100644 --- a/tests/clinvar_xml_io/test_clinvar_submitted_record.py +++ b/tests/clinvar_xml_io/test_clinvar_submitted_record.py @@ -27,8 +27,10 @@ def test_clinvar_set(clinvar_set): def test_clinvar_submitted_record(submitted_record): - assert submitted_record.accession == 'SCV000022285' assert submitted_record.submitter == 'OMIM' + assert submitted_record.submitter_id == '3' + assert submitted_record.submission_name is None + assert submitted_record.accession == 'SCV000022285' assert submitted_record.valid_allele_origins == {'germline'} assert submitted_record.evidence_support_pubmed_refs == [15258582, 15322982] From a7870e69a506b2618387db6e2530301ae2f619cb Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 21 Aug 2024 13:54:47 +0100 Subject: [PATCH 4/4] add reference record class --- cmat/clinvar_xml_io/clinvar_dataset.py | 4 +- cmat/clinvar_xml_io/clinvar_record.py | 30 ++++++-------- .../clinvar_reference_record.py | 41 +++++++++++++++++++ .../clinvar_submitted_record.py | 2 - .../drug-response-background-trait.ipynb | 2 +- tests/clinvar_xml_io/test_clinvar_record.py | 2 + 6 files changed, 58 insertions(+), 23 deletions(-) create mode 100644 cmat/clinvar_xml_io/clinvar_reference_record.py diff --git a/cmat/clinvar_xml_io/clinvar_dataset.py b/cmat/clinvar_xml_io/clinvar_dataset.py index b436f11a..a896cbdd 100644 --- a/cmat/clinvar_xml_io/clinvar_dataset.py +++ b/cmat/clinvar_xml_io/clinvar_dataset.py @@ -3,7 +3,7 @@ import re from datetime import date -from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord +from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ def __init__(self, clinvar_xml): def __iter__(self): for rcv in iterate_rcv_from_xml(self.clinvar_xml): - yield ClinVarRecord(rcv, self.xsd_version) + yield ClinVarReferenceRecord(rcv, self.xsd_version) def get_xsd_version(self): # For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index db0ac421..18635539 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -4,7 +4,7 @@ from functools import cached_property from xml.dom import minidom -from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError +from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \ @@ -15,10 +15,10 @@ class ClinVarRecord: - """Instances of this class hold data on individual ClinVar records. See also: - * /data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model; - * Issue https://github.com/EBIvariation/eva-opentargets/issues/127 for the most recent discussions on changing - support of different ClinVar record types.""" + """ + Base class for both reference and submitted records in ClinVar. See also: + /data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model + """ # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful. NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'} @@ -62,13 +62,15 @@ def accession(self): @property def last_updated_date(self): - """This tracks the latest update date, counting even minor technical updates.""" - return self.record_xml.attrib['DateLastUpdated'] + """This tracks the latest update date, counting even minor technical updates. + Appears differently in reference and submitted records.""" + raise NotImplementedError @property def created_date(self): - """This tracks the date the record was first made public on ClinVar.""" - return self.record_xml.attrib['DateCreated'] + """This tracks the date the record was first made public on ClinVar. + Appears differently in reference and submitted records.""" + raise NotImplementedError @property def mode_of_inheritance(self): @@ -111,15 +113,7 @@ def valid_allele_origins(self): @cached_property def clinical_classifications(self): """List of clinical classifications (Germline, Somatic, or Oncogenecity)""" - clinical_classifications = [] - if self.xsd_version < 2: - # V1 only ever has a single clinical classification / clinical significance - clinical_classifications.append( - ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self)) - else: - for clin_class in find_elements(self.record_xml, './Classifications/*'): - clinical_classifications.append(ClinicalClassification(clin_class, self)) - return clinical_classifications + raise NotImplementedError # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord # if there is exactly one ClinicalClassification for the record. diff --git a/cmat/clinvar_xml_io/clinvar_reference_record.py b/cmat/clinvar_xml_io/clinvar_reference_record.py new file mode 100644 index 00000000..90bd4f4e --- /dev/null +++ b/cmat/clinvar_xml_io/clinvar_reference_record.py @@ -0,0 +1,41 @@ +import logging +from functools import cached_property + +from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification + +from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord +from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class ClinVarReferenceRecord(ClinVarRecord): + """Reference records (RCVs) summarise information from submitted records (SCVs) and include additional annotations + and cross-references supplied by ClinVar.""" + + def __init__(self, record_xml, xsd_version): + super().__init__(record_xml, xsd_version) + + def __str__(self): + return f'ClinVarReferenceRecord object with accession {self.accession}' + + @property + def last_updated_date(self): + return self.record_xml.attrib['DateLastUpdated'] + + @property + def created_date(self): + return self.record_xml.attrib['DateCreated'] + + @cached_property + def clinical_classifications(self): + clinical_classifications = [] + if self.xsd_version < 2: + # V1 only ever has a single clinical classification / clinical significance + clinical_classifications.append( + ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self)) + else: + for clin_class in find_elements(self.record_xml, './Classifications/*'): + clinical_classifications.append(ClinicalClassification(clin_class, self)) + return clinical_classifications diff --git a/cmat/clinvar_xml_io/clinvar_submitted_record.py b/cmat/clinvar_xml_io/clinvar_submitted_record.py index 1d0264b6..7934cea3 100644 --- a/cmat/clinvar_xml_io/clinvar_submitted_record.py +++ b/cmat/clinvar_xml_io/clinvar_submitted_record.py @@ -32,12 +32,10 @@ def submission_date(self): @property def last_updated_date(self): - """Overrides parent definition, in SCV this date is in the accession element""" return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateUpdated'] @property def created_date(self): - """Overrides parent definition, in SCV this date is in the accession element""" return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateCreated'] @property diff --git a/data-exploration/drug-response/drug-response-background-trait.ipynb b/data-exploration/drug-response/drug-response-background-trait.ipynb index ca9aab01..8db5a3bb 100644 --- a/data-exploration/drug-response/drug-response-background-trait.ipynb +++ b/data-exploration/drug-response/drug-response-background-trait.ipynb @@ -3087,4 +3087,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/tests/clinvar_xml_io/test_clinvar_record.py b/tests/clinvar_xml_io/test_clinvar_record.py index a727560d..282dfda7 100644 --- a/tests/clinvar_xml_io/test_clinvar_record.py +++ b/tests/clinvar_xml_io/test_clinvar_record.py @@ -35,6 +35,8 @@ def test_multiple_clinical_classifications_record(): class TestClinvarRecord: + """Tests base class as well as reference record""" + @classmethod def setup_class(cls): input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')