EBIvariation · apriltuesday · Aug 22, 2024 · Aug 15, 2024 · Aug 16, 2024 · Aug 19, 2024
diff --git a/bin/cmat/VERSION b/bin/cmat/VERSION
@@ -1 +1 @@
-3.2.2
+3.3.0.dev
diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py
@@ -1,6 +1,7 @@
 import logging
 import re
 import xml.etree.ElementTree as ElementTree
+from functools import cached_property
 from xml.dom import minidom
 
 from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
@@ -22,42 +23,32 @@ class ClinVarRecord:
     # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
     NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
 
-    def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
+    def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
         """Initialise a ClinVar record object from an RCV XML record."""
-        self.rcv = rcv
+        self.record_xml = record_xml
         self.xsd_version = xsd_version
 
         # Add a list of traits
         self.trait_set = []
-        for trait in find_elements(self.rcv, './TraitSet/Trait'):
+        for trait in find_elements(self.record_xml, './TraitSet/Trait'):
             self.trait_set.append(trait_class(trait, self))
 
         # We are currently only processing MeasureSets of type Variant which are included directly in the RCV record.
         # Some other options (currently not supported) are:
         # * MeasureSet of types "Haplotype", "Phase unknown", or "Distinct chromosomes"
         # * GenotypeSet, which contains an assertion about a group of variants from different chromosome copies, with
         #   the type of be either a "CompoundHeterozygote" or a "Diplotype"
-        variant_measure = find_optional_unique_element(self.rcv, './MeasureSet[@Type="Variant"]/Measure')
+        variant_measure = find_optional_unique_element(self.record_xml, './MeasureSet[@Type="Variant"]/Measure')
         if not variant_measure:
             self.measure = None
         else:
             self.measure = measure_class(variant_measure, self)
 
-        # List of clinical classifications (Germline, Somatic, or Oncogenecity
-        self.clinical_classifications = []
-        if self.xsd_version < 2:
-            # V1 only ever has a single clinical classification / clinical significance
-            self.clinical_classifications.append(
-                ClinicalClassification(find_mandatory_unique_element(self.rcv, './ClinicalSignificance'), self))
-        else:
-            for clin_class in find_elements(self.rcv, './Classifications/*'):
-                self.clinical_classifications.append(ClinicalClassification(clin_class, self))
-
     def __str__(self):
         return f'ClinVarRecord object with accession {self.accession}'
 
     def write(self, output):
-        xml_str = minidom.parseString(ElementTree.tostring(self.rcv)).toprettyxml(indent='  ', encoding='utf-8')
+        xml_str = minidom.parseString(ElementTree.tostring(self.record_xml)).toprettyxml(indent='  ', encoding='utf-8')
         # version 3.8 adds superfluous root
         if xml_str.startswith(b'<?xml'):
             xml_str = re.sub(b'<\?xml.*?>', b'', xml_str)
@@ -67,28 +58,28 @@ def write(self, output):
 
     @property
     def accession(self):
-        return find_mandatory_unique_element(self.rcv, './ClinVarAccession').attrib['Acc']
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']
 
     @property
-    def date(self):
+    def last_updated_date(self):
         """This tracks the latest update date, counting even minor technical updates."""
-        return self.rcv.attrib['DateLastUpdated']
+        return self.record_xml.attrib['DateLastUpdated']
 
     @property
     def created_date(self):
         """This tracks the date the record was first made public on ClinVar."""
-        return self.rcv.attrib['DateCreated']
+        return self.record_xml.attrib['DateCreated']
 
     @property
     def mode_of_inheritance(self):
         """Return a (possibly empty) list of modes of inheritance for a given ClinVar record."""
         return sorted({
-            elem.text for elem in find_elements(self.rcv, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
+            elem.text for elem in find_elements(self.record_xml, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
         })
 
     @property
     def trait_set_type(self):
-        return find_mandatory_unique_element(self.rcv, './TraitSet').attrib['Type']
+        return find_mandatory_unique_element(self.record_xml, './TraitSet').attrib['Type']
 
     @property
     def traits(self):
@@ -106,17 +97,30 @@ def evidence_support_pubmed_refs(self):
         specific disease. These are the references displayed on the ClinVar website in the "Assertion and evidence
         details" section at the bottom of the page."""
         return [int(elem.text)
-                for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]
+                for elem in find_elements(self.record_xml, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]
 
     @property
     def allele_origins(self):
-        return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
+        return {elem.text for elem in find_elements(self.record_xml, './ObservedIn/Sample/Origin')}
 
     @property
     def valid_allele_origins(self):
         """Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
         return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}
 
+    @cached_property
+    def clinical_classifications(self):
+        """List of clinical classifications (Germline, Somatic, or Oncogenecity)"""
+        clinical_classifications = []
+        if self.xsd_version < 2:
+            # V1 only ever has a single clinical classification / clinical significance
+            clinical_classifications.append(
+                ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self))
+        else:
+            for clin_class in find_elements(self.record_xml, './Classifications/*'):
+                clinical_classifications.append(ClinicalClassification(clin_class, self))
+        return clinical_classifications
+
     # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
     # if there is exactly one ClinicalClassification for the record.
     # Otherwise these should be taken from the ClinicalClassification objects directly.

diff --git a/cmat/clinvar_xml_io/clinvar_set.py b/cmat/clinvar_xml_io/clinvar_set.py
@@ -0,0 +1,30 @@
+from cmat.clinvar_xml_io import ClinVarRecord
+from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements
+
+
+class ClinVarSet:
+    """
+    A ClinVarSet groups together a single reference record (RCV) and one or more submitted records (SCVs).
+    """
+
+    def __init__(self, cvs_xml, xsd_version):
+        self.cvs_xml = cvs_xml
+
+        rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
+        self.rcv = ClinVarRecord(rcv_elem, xsd_version)
+
+        scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
+        self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]
+
+    @property
+    def id(self):
+        return self.cvs_xml.attrib['ID']
+
+    @property
+    def title(self):
+        return find_mandatory_unique_element(self.cvs_xml, './Title').text
+
+    @property
+    def status(self):
+        return find_mandatory_unique_element(self.cvs_xml, './RecordStatus').text
diff --git a/cmat/clinvar_xml_io/clinvar_submitted_record.py b/cmat/clinvar_xml_io/clinvar_submitted_record.py
@@ -0,0 +1,61 @@
+import logging
+from functools import cached_property
+
+from cmat.clinvar_xml_io import ClinVarRecord
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class ClinVarSubmittedRecord(ClinVarRecord):
+    """
+    Submitted records (SCVs) are structured similarly to reference records (RCVs) with a few exceptions, though they
+    typically have fewer annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are
+    added by curators.
+
+    SCVs also contain additional information about the actual submission, which we model in this class.
+    """
+
+    def __init__(self, record_xml, xsd_version, reference_record):
+        super().__init__(record_xml, xsd_version)
+        # Each SCV is associated with a single RCV
+        self.reference_record = reference_record
+
+    def __str__(self):
+        return f'ClinVarSubmittedRecord object with accession {self.accession}'
+
+    @property
+    def submission_date(self):
+        """Date of submission or when submission was last revised (for first submission, use created_date)."""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitterDate']
+
+    @property
+    def last_updated_date(self):
+        """Overrides parent definition, in SCV this date is in the accession element"""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateUpdated']
+
+    @property
+    def created_date(self):
+        """Overrides parent definition, in SCV this date is in the accession element"""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateCreated']
+
+    @property
+    def submitter(self):
+        """Name of the submitting organization."""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitter']
+
+    @property
+    def submitter_id(self):
+        """Numeric identifier associated with the submitting organization."""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['OrgID']
+
+    @property
+    def submission_name(self):
+        """Name or identifier associated with the submission. This is optional."""
+        return self.record_xml.attrib.get('SubmissionName', None)
+
+    @cached_property
+    def clinical_classifications(self):
+        # Submitted record clinical classifications are defined a bit differently than reference records
+        raise NotImplementedError('Clinical classification parsing not implemented for SCVs')
diff --git a/cmat/clinvar_xml_io/xml_parsing.py b/cmat/clinvar_xml_io/xml_parsing.py
@@ -30,18 +30,22 @@ def parse_header_attributes(clinvar_xml):
 
 def iterate_rcv_from_xml(clinvar_xml):
     """Iterates through the gzipped ClinVar XML and yields complete <ReferenceClinVarAssertion> records."""
+    for cvs in iterate_cvs_from_xml(clinvar_xml):
+        # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
+        # ClinVar. There should only be one such record per ClinVarSet.
+        rcv = find_mandatory_unique_element(cvs, 'ReferenceClinVarAssertion')
+        yield rcv
+
+
+def iterate_cvs_from_xml(clinvar_xml):
+    """Iterates through the gzipped ClinVar XML and yields complete <ClinVarSet> elements."""
     with gzip.open(clinvar_xml, 'rt') as fh:
         for event, elem in ElementTree.iterparse(fh):
             # Wait until we have built a complete ClinVarSet element
             if elem.tag != 'ClinVarSet':
                 continue
-
-            # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
-            # ClinVar. There should only be one such record per ClinVarSet.
-            rcv = find_mandatory_unique_element(elem, 'ReferenceClinVarAssertion')
-
             # Return the complete record and then remove the processed element from the tree to save memory
-            yield rcv
+            yield elem
             elem.clear()
 
 

diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py
@@ -213,8 +213,8 @@ def print_counter(counter):
 
 class AnnotatedClinVarRecord(ClinVarRecord):
 
-    def __init__(self, rcv, xsd_version):
-        super().__init__(rcv, xsd_version, trait_class=OntologyMappedClinVarTrait,
+    def __init__(self, record_xml, xsd_version):
+        super().__init__(record_xml, xsd_version, trait_class=OntologyMappedClinVarTrait,
                          measure_class=EnsemblAnnotatedClinVarMeasure)
 
 

diff --git a/data-exploration/clinvar-variant-types/clinvar-variant-types.py b/data-exploration/clinvar-variant-types/clinvar-variant-types.py
@@ -125,8 +125,8 @@ def rcv_to_link(rcv_id):
     rcv_id = clinvar_record.accession
 
     # RCV can contain either a MeasureSet, or a GenotypeSet. It must not contain both.
-    measure_sets = clinvar_record.rcv.findall('MeasureSet')
-    genotype_sets = clinvar_record.rcv.findall('GenotypeSet')
+    measure_sets = clinvar_record.record_xml.findall('MeasureSet')
+    genotype_sets = clinvar_record.record_xml.findall('GenotypeSet')
     if len(measure_sets) == 1 and len(genotype_sets) == 0:
         # Most common case. RCV directly contains one measure set.
         measure_set = measure_sets[0]

diff --git a/data-exploration/filter_clinvar_xml.py b/data-exploration/filter_clinvar_xml.py
@@ -4,7 +4,7 @@
 import xml.etree.ElementTree as ElementTree
 
 from cmat.clinvar_xml_io import ClinVarRecord
-from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, iterate_cvs_from_xml
 from cmat.output_generation.clinvar_to_evidence_strings import get_consequence_types
 from cmat.output_generation.consequence_type import process_consequence_type_file
 
@@ -18,18 +18,6 @@ def pprint(x):
     print(ElementTree.tostring(x, encoding='unicode'))
 
 
-def iterate_cvs_from_xml(clinvar_xml):
-    """Similar to iterate_rcv_from_xml in clinvar_xml_utils, but keeps the entire ClinVarSet XML element.
-    This allows us to construct a valid ClinVar XML for easy future processing."""
-    with gzip.open(clinvar_xml, 'rt') as fh:
-        for event, elem in ElementTree.iterparse(fh):
-            # Wait until we have built a complete ClinVarSet element
-            if elem.tag != 'ClinVarSet':
-                continue
-            yield elem
-            elem.clear()
-
-
 def filter_xml(input_xml, output_xml, filter_fct, max_num=None):
     """ Filter input_xml by boolean condition defined by filter_fct and write to output_xml.
     If max_num is given, will write at most max_num records, otherwise writes all."""

diff --git a/tests/clinvar_xml_io/test_clinvar_measure.py b/tests/clinvar_xml_io/test_clinvar_measure.py
@@ -0,0 +1,40 @@
+import os
+
+from cmat.clinvar_xml_io import ClinVarDataset
+
+resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
+
+
+class TestClinvarRecordMeasure:
+    @classmethod
+    def setup_class(cls):
+        input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
+        cls.test_crm = next(iter(ClinVarDataset(input_file))).measure
+
+    def test_hgvs(self):
+        text_hgvs = [h.text for h in self.test_crm.all_hgvs]
+        assert text_hgvs == ['NM_152443.3:c.677A>G',
+                             'NG_008321.1:g.32324A>G',
+                             'NC_000014.9:g.67729209A>G',
+                             'NC_000014.8:g.68195926A>G',
+                             'NM_152443.2:c.677A>G',
+                             'Q96NR8:p.Tyr226Cys',
+                             'NP_689656.2:p.Tyr226Cys']
+
+    def test_preferred_current_hgvs(self):
+        assert self.test_crm.preferred_current_hgvs.text == 'NC_000014.9:g.67729209A>G'
+
+    def test_rs(self):
+        assert self.test_crm.rs_id == 'rs28940313'
+
+    def test_nsv(self):
+        assert self.test_crm.nsv_id is None
+
+    def test_variant_type(self):
+        assert self.test_crm.variant_type == 'single nucleotide variant'
+
+    def test_measure_set_pubmed_refs(self):
+        assert self.test_crm.pubmed_refs == []
+
+    def test_so_terms(self):
+        assert self.test_crm.existing_so_terms == {'SO:0001583'}
diff --git a/tests/clinvar_xml_io/test_clinvar_record.py b/tests/clinvar_xml_io/test_clinvar_record.py
@@ -32,3 +32,45 @@ def test_multiple_clinical_classifications_record():
     assert set(cc.type for cc in record.clinical_classifications) == {'GermlineClassification', 'SomaticClinicalImpact'}
     with pytest.raises(MultipleClinicalClassificationsError):
         print(record.valid_clinical_significances)
+
+
+class TestClinvarRecord:
+    @classmethod
+    def setup_class(cls):
+        input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
+        cls.test_clinvar_record = next(iter(ClinVarDataset(input_file)))
+
+    def test_date(self):
+        """Check that the last updated date of the referenceClinVarAssertion is loaded correctly"""
+        assert self.test_clinvar_record.last_updated_date == '2024-04-15'
+
+    def test_score(self):
+        assert self.test_clinvar_record.score == 2
+
+    def test_review_status(self):
+        assert self.test_clinvar_record.review_status == 'criteria provided, multiple submitters, no conflicts'
+
+    def test_acc(self):
+        assert self.test_clinvar_record.accession == 'RCV000002127'
+
+    def test_traits(self):
+        assert self.test_clinvar_record.traits[0].preferred_name == 'Leber congenital amaurosis 13'
+        assert self.test_clinvar_record.traits[0].preferred_or_other_valid_name == 'Leber congenital amaurosis 13'
+
+    def test_trait_pubmed_refs(self):
+        assert self.test_clinvar_record.traits[0].pubmed_refs == [20301590, 30285347]
+
+    def test_observed_pubmed_refs(self):
+        assert self.test_clinvar_record.evidence_support_pubmed_refs == [15258582, 15322982]
+
+    def test_clinical_significance(self):
+        assert self.test_clinvar_record.clinical_significance_list == ['likely pathogenic', 'pathogenic']
+
+    def test_allele_origins(self):
+        assert self.test_clinvar_record.allele_origins == {'germline', 'inherited', 'unknown'}
+
+    def test_valid_allele_origins(self):
+        assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'}
+
+    def test_trait_efo_ids(self):
+        assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')]