EBIvariation · apriltuesday · Aug 22, 2024 · Aug 15, 2024 · Aug 16, 2024 · Aug 19, 2024
diff --git a/bin/cmat/VERSION b/bin/cmat/VERSION
@@ -1 +1 @@
-3.2.2
+3.3.0.dev
diff --git a/cmat/clinvar_xml_io/clinvar_dataset.py b/cmat/clinvar_xml_io/clinvar_dataset.py
@@ -3,7 +3,7 @@
 import re
 from datetime import date
 
-from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
+from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
 from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes
 
 logger = logging.getLogger(__name__)
@@ -20,7 +20,7 @@ def __init__(self, clinvar_xml):
 
     def __iter__(self):
         for rcv in iterate_rcv_from_xml(self.clinvar_xml):
-            yield ClinVarRecord(rcv, self.xsd_version)
+            yield ClinVarReferenceRecord(rcv, self.xsd_version)
 
     def get_xsd_version(self):
         # For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md

diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py
@@ -1,9 +1,10 @@
 import logging
 import re
 import xml.etree.ElementTree as ElementTree
+from functools import cached_property
 from xml.dom import minidom
 
-from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
+from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
 from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure
 from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait
 from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \
@@ -14,50 +15,40 @@
 
 
 class ClinVarRecord:
-    """Instances of this class hold data on individual ClinVar records. See also:
-    * /data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model;
-    * Issue https://github.com/EBIvariation/eva-opentargets/issues/127 for the most recent discussions on changing
-      support of different ClinVar record types."""
+    """
+    Base class for both reference and submitted records in ClinVar. See also:
+    /data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model
+    """
 
     # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
     NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
 
-    def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
+    def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
         """Initialise a ClinVar record object from an RCV XML record."""
-        self.rcv = rcv
+        self.record_xml = record_xml
         self.xsd_version = xsd_version
 
         # Add a list of traits
         self.trait_set = []
-        for trait in find_elements(self.rcv, './TraitSet/Trait'):
+        for trait in find_elements(self.record_xml, './TraitSet/Trait'):
             self.trait_set.append(trait_class(trait, self))
 
         # We are currently only processing MeasureSets of type Variant which are included directly in the RCV record.
         # Some other options (currently not supported) are:
         # * MeasureSet of types "Haplotype", "Phase unknown", or "Distinct chromosomes"
         # * GenotypeSet, which contains an assertion about a group of variants from different chromosome copies, with
         #   the type of be either a "CompoundHeterozygote" or a "Diplotype"
-        variant_measure = find_optional_unique_element(self.rcv, './MeasureSet[@Type="Variant"]/Measure')
+        variant_measure = find_optional_unique_element(self.record_xml, './MeasureSet[@Type="Variant"]/Measure')
         if not variant_measure:
             self.measure = None
         else:
             self.measure = measure_class(variant_measure, self)
 
-        # List of clinical classifications (Germline, Somatic, or Oncogenecity
-        self.clinical_classifications = []
-        if self.xsd_version < 2:
-            # V1 only ever has a single clinical classification / clinical significance
-            self.clinical_classifications.append(
-                ClinicalClassification(find_mandatory_unique_element(self.rcv, './ClinicalSignificance'), self))
-        else:
-            for clin_class in find_elements(self.rcv, './Classifications/*'):
-                self.clinical_classifications.append(ClinicalClassification(clin_class, self))
-
     def __str__(self):
         return f'ClinVarRecord object with accession {self.accession}'
 
     def write(self, output):
-        xml_str = minidom.parseString(ElementTree.tostring(self.rcv)).toprettyxml(indent='  ', encoding='utf-8')
+        xml_str = minidom.parseString(ElementTree.tostring(self.record_xml)).toprettyxml(indent='  ', encoding='utf-8')
         # version 3.8 adds superfluous root
         if xml_str.startswith(b'<?xml'):
             xml_str = re.sub(b'<\?xml.*?>', b'', xml_str)
@@ -67,28 +58,30 @@ def write(self, output):
 
     @property
     def accession(self):
-        return find_mandatory_unique_element(self.rcv, './ClinVarAccession').attrib['Acc']
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']
 
     @property
-    def date(self):
-        """This tracks the latest update date, counting even minor technical updates."""
-        return self.rcv.attrib['DateLastUpdated']
+    def last_updated_date(self):
+        """This tracks the latest update date, counting even minor technical updates.
+        Appears differently in reference and submitted records."""
+        raise NotImplementedError
 
     @property
     def created_date(self):
-        """This tracks the date the record was first made public on ClinVar."""
-        return self.rcv.attrib['DateCreated']
+        """This tracks the date the record was first made public on ClinVar.
+        Appears differently in reference and submitted records."""
+        raise NotImplementedError
 
     @property
     def mode_of_inheritance(self):
         """Return a (possibly empty) list of modes of inheritance for a given ClinVar record."""
         return sorted({
-            elem.text for elem in find_elements(self.rcv, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
+            elem.text for elem in find_elements(self.record_xml, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
         })
 
     @property
     def trait_set_type(self):
-        return find_mandatory_unique_element(self.rcv, './TraitSet').attrib['Type']
+        return find_mandatory_unique_element(self.record_xml, './TraitSet').attrib['Type']
 
     @property
     def traits(self):
@@ -106,17 +99,22 @@ def evidence_support_pubmed_refs(self):
         specific disease. These are the references displayed on the ClinVar website in the "Assertion and evidence
         details" section at the bottom of the page."""
         return [int(elem.text)
-                for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]
+                for elem in find_elements(self.record_xml, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]
 
     @property
     def allele_origins(self):
-        return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
+        return {elem.text for elem in find_elements(self.record_xml, './ObservedIn/Sample/Origin')}
 
     @property
     def valid_allele_origins(self):
         """Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
         return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}
 
+    @cached_property
+    def clinical_classifications(self):
+        """List of clinical classifications (Germline, Somatic, or Oncogenicity).
+        Parsed differently for reference and submitted records, so subclasses must override this."""
+        raise NotImplementedError
+
     # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
     # if there is exactly one ClinicalClassification for the record.
     # Otherwise these should be taken from the ClinicalClassification objects directly.

diff --git a/cmat/clinvar_xml_io/clinvar_reference_record.py b/cmat/clinvar_xml_io/clinvar_reference_record.py
@@ -0,0 +1,41 @@
+import logging
+from functools import cached_property
+
+from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification
+
+from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class ClinVarReferenceRecord(ClinVarRecord):
+    """Reference records (RCVs) summarise information from submitted records (SCVs) and include additional annotations
+    and cross-references supplied by ClinVar.
+
+    The constructor is inherited unchanged from ClinVarRecord, so the optional trait_class and measure_class
+    arguments remain available to callers and subclasses."""
+
+    def __str__(self):
+        return f'ClinVarReferenceRecord object with accession {self.accession}'
+
+    @property
+    def last_updated_date(self):
+        """Value of the DateLastUpdated attribute on the record's root XML element."""
+        return self.record_xml.attrib['DateLastUpdated']
+
+    @property
+    def created_date(self):
+        """Value of the DateCreated attribute on the record's root XML element."""
+        return self.record_xml.attrib['DateCreated']
+
+    @cached_property
+    def clinical_classifications(self):
+        """List of ClinicalClassification objects for this record, computed on first access and then cached."""
+        if self.xsd_version < 2:
+            # V1 only ever has a single clinical classification / clinical significance
+            clin_class_elems = [find_mandatory_unique_element(self.record_xml, './ClinicalSignificance')]
+        else:
+            # Later versions list every classification as a child of <Classifications>
+            clin_class_elems = find_elements(self.record_xml, './Classifications/*')
+        return [ClinicalClassification(elem, self) for elem in clin_class_elems]
diff --git a/cmat/clinvar_xml_io/clinvar_set.py b/cmat/clinvar_xml_io/clinvar_set.py
@@ -0,0 +1,30 @@
+from cmat.clinvar_xml_io import ClinVarRecord
+from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
+from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements
+
+
+class ClinVarSet:
+    """
+    A ClinVarSet groups together a single reference record (RCV) and one or more submitted records (SCVs).
+    """
+
+    def __init__(self, cvs_xml, xsd_version):
+        """
+        Build the reference record and all submitted records contained in a ClinVarSet.
+
+        :param cvs_xml: parsed <ClinVarSet> XML element
+        :param xsd_version: ClinVar XSD version, passed through to the record classes
+        """
+        self.cvs_xml = cvs_xml
+
+        # Use the concrete reference record class: the ClinVarRecord base class raises NotImplementedError for
+        # last_updated_date, created_date and clinical_classifications.
+        rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
+        self.rcv = ClinVarReferenceRecord(rcv_elem, xsd_version)
+
+        # At least one SCV is required (allow_zero=False); each SCV keeps a link back to its RCV
+        scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
+        self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]
+
+    @property
+    def id(self):
+        """Value of the ID attribute on the ClinVarSet element."""
+        return self.cvs_xml.attrib['ID']
+
+    @property
+    def title(self):
+        """Text of the mandatory <Title> child element."""
+        return find_mandatory_unique_element(self.cvs_xml, './Title').text
+
+    @property
+    def status(self):
+        """Text of the mandatory <RecordStatus> child element."""
+        return find_mandatory_unique_element(self.cvs_xml, './RecordStatus').text
diff --git a/cmat/clinvar_xml_io/clinvar_submitted_record.py b/cmat/clinvar_xml_io/clinvar_submitted_record.py
@@ -0,0 +1,59 @@
+import logging
+from functools import cached_property
+
+from cmat.clinvar_xml_io import ClinVarRecord
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class ClinVarSubmittedRecord(ClinVarRecord):
+    """
+    A submitted record (SCV). Its structure is similar to that of a reference record (RCV) with a few exceptions,
+    though it typically has fewer annotations - for example variant coordinates, HGVS expressions or ontology
+    mappings, which are added by curators.
+
+    This class also models the information about the submission itself that only SCVs carry.
+    """
+
+    def __init__(self, record_xml, xsd_version, reference_record):
+        super().__init__(record_xml, xsd_version)
+        # The single RCV this SCV is associated with
+        self.reference_record = reference_record
+
+    def __str__(self):
+        return f'ClinVarSubmittedRecord object with accession {self.accession}'
+
+    def _submission_id_attribute(self, name):
+        """Read one attribute from the mandatory, unique <ClinVarSubmissionID> child element."""
+        submission_id = find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID')
+        return submission_id.attrib[name]
+
+    def _accession_attribute(self, name):
+        """Read one attribute from the mandatory, unique <ClinVarAccession> child element."""
+        clinvar_accession = find_mandatory_unique_element(self.record_xml, './ClinVarAccession')
+        return clinvar_accession.attrib[name]
+
+    @property
+    def submission_date(self):
+        """Date of submission or when submission was last revised (for first submission, use created_date)."""
+        return self._submission_id_attribute('submitterDate')
+
+    @property
+    def last_updated_date(self):
+        """DateUpdated attribute of the record's ClinVarAccession element."""
+        return self._accession_attribute('DateUpdated')
+
+    @property
+    def created_date(self):
+        """DateCreated attribute of the record's ClinVarAccession element."""
+        return self._accession_attribute('DateCreated')
+
+    @property
+    def submitter(self):
+        """Name of the submitting organization."""
+        return self._submission_id_attribute('submitter')
+
+    @property
+    def submitter_id(self):
+        """Numeric identifier associated with the submitting organization."""
+        return self._accession_attribute('OrgID')
+
+    @property
+    def submission_name(self):
+        """Name or identifier associated with the submission. Optional, so None is returned when absent."""
+        return self.record_xml.attrib.get('SubmissionName')
+
+    @cached_property
+    def clinical_classifications(self):
+        # Submitted record clinical classifications are defined a bit differently than reference records
+        raise NotImplementedError('Clinical classification parsing not implemented for SCVs')
diff --git a/cmat/clinvar_xml_io/xml_parsing.py b/cmat/clinvar_xml_io/xml_parsing.py
@@ -30,18 +30,22 @@ def parse_header_attributes(clinvar_xml):
 
 def iterate_rcv_from_xml(clinvar_xml):
     """Iterates through the gzipped ClinVar XML and yields complete <ReferenceClinVarAssertion> records."""
+    for cvs in iterate_cvs_from_xml(clinvar_xml):
+        # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
+        # ClinVar. There should only be one such record per ClinVarSet.
+        rcv = find_mandatory_unique_element(cvs, 'ReferenceClinVarAssertion')
+        yield rcv
+
+
+def iterate_cvs_from_xml(clinvar_xml):
+    """Iterates through the gzipped ClinVar XML and yields complete <ClinVarSet> elements."""
     with gzip.open(clinvar_xml, 'rt') as fh:
         for event, elem in ElementTree.iterparse(fh):
             # Wait until we have built a complete ClinVarSet element
             if elem.tag != 'ClinVarSet':
                 continue
-
-            # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
-            # ClinVar. There should only be one such record per ClinVarSet.
-            rcv = find_mandatory_unique_element(elem, 'ReferenceClinVarAssertion')
-
             # Return the complete record and then remove the processed element from the tree to save memory
-            yield rcv
+            yield elem
             elem.clear()
 
 

diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py
@@ -213,8 +213,8 @@ def print_counter(counter):
 
 class AnnotatedClinVarRecord(ClinVarRecord):
 
-    def __init__(self, rcv, xsd_version):
-        super().__init__(rcv, xsd_version, trait_class=OntologyMappedClinVarTrait,
+    def __init__(self, record_xml, xsd_version):
+        super().__init__(record_xml, xsd_version, trait_class=OntologyMappedClinVarTrait,
                          measure_class=EnsemblAnnotatedClinVarMeasure)
 
 

diff --git a/data-exploration/clinvar-variant-types/clinvar-variant-types.py b/data-exploration/clinvar-variant-types/clinvar-variant-types.py
@@ -125,8 +125,8 @@ def rcv_to_link(rcv_id):
     rcv_id = clinvar_record.accession
 
     # RCV can contain either a MeasureSet, or a GenotypeSet. It must not contain both.
-    measure_sets = clinvar_record.rcv.findall('MeasureSet')
-    genotype_sets = clinvar_record.rcv.findall('GenotypeSet')
+    measure_sets = clinvar_record.record_xml.findall('MeasureSet')
+    genotype_sets = clinvar_record.record_xml.findall('GenotypeSet')
     if len(measure_sets) == 1 and len(genotype_sets) == 0:
         # Most common case. RCV directly contains one measure set.
         measure_set = measure_sets[0]

diff --git a/data-exploration/drug-response/drug-response-background-trait.ipynb b/data-exploration/drug-response/drug-response-background-trait.ipynb
@@ -3087,4 +3087,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
diff --git a/data-exploration/filter_clinvar_xml.py b/data-exploration/filter_clinvar_xml.py
@@ -4,7 +4,7 @@
 import xml.etree.ElementTree as ElementTree
 
 from cmat.clinvar_xml_io import ClinVarRecord
-from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, iterate_cvs_from_xml
 from cmat.output_generation.clinvar_to_evidence_strings import get_consequence_types
 from cmat.output_generation.consequence_type import process_consequence_type_file
 
@@ -18,18 +18,6 @@ def pprint(x):
     print(ElementTree.tostring(x, encoding='unicode'))
 
 
-def iterate_cvs_from_xml(clinvar_xml):
-    """Similar to iterate_rcv_from_xml in clinvar_xml_utils, but keeps the entire ClinVarSet XML element.
-    This allows us to construct a valid ClinVar XML for easy future processing."""
-    with gzip.open(clinvar_xml, 'rt') as fh:
-        for event, elem in ElementTree.iterparse(fh):
-            # Wait until we have built a complete ClinVarSet element
-            if elem.tag != 'ClinVarSet':
-                continue
-            yield elem
-            elem.clear()
-
-
 def filter_xml(input_xml, output_xml, filter_fct, max_num=None):
     """ Filter input_xml by boolean condition defined by filter_fct and write to output_xml.
     If max_num is given, will write at most max_num records, otherwise writes all."""
-Original file line number
+Diff line change
@@ Expand Up / @@ -3087,4 +3087,4 @@ @@
      },
      "nbformat": 4,
      "nbformat_minor": 5
-    }
+    }