Merge pull request #432 from apriltuesday/new-xml

Issue 396 - Migrate to ClinVar XML V2
EBIvariation · Jun 4, 2024 · 21e78db · 21e78db
2 parents 10ca8b3 + 0d51f7a
commit 21e78db
Show file tree

Hide file tree

Showing 36 changed files with 1,290 additions and 1,238 deletions.
diff --git a/README.md b/README.md
@@ -93,7 +93,7 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
 ```
 You can use the `--include_transcripts` flag to also include transcript annotations with the functional consequences.
 
-By default, the pipeline will download and annotate the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.
+By default, the pipeline will download and annotate the latest ClinVar RCV XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.
 
 ### Trait curation
 
@@ -125,9 +125,9 @@ nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \
   -resume
 ```
 
-By default, the pipeline will download and map the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.
+By default, the pipeline will download and map the latest ClinVar RCV XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.
 
-To create the curation spreadsheet, first make your own copy of the [template](https://docs.google.com/spreadsheets/d/1PyDzRs3bO1klvvSv9XuHmx-x7nqZ0UAGeS6aV2SQ2Yg/edit?usp=sharing).
+To create the curation spreadsheet, first make your own copy of the [template](https://docs.google.com/spreadsheets/d/1GWAQAZjOpzsIkdCu0CSRDoehZEUB3VjZYYiHWp9Tn7Q/edit?usp=sharing).
 Then paste the contents of `${CURATION_ROOT}/google_sheets_table.tsv` into it, starting with column H “ClinVar label”.
 
 #### Manual curation

diff --git a/bin/cmat/VERSION b/bin/cmat/VERSION
@@ -1 +1 @@
-3.1.3
+3.2.0.dev0
diff --git a/cmat/clinvar_xml_io/clinical_classification.py b/cmat/clinvar_xml_io/clinical_classification.py
@@ -0,0 +1,82 @@
+import re
+
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_optional_unique_element
+
+
+class MultipleClinicalClassificationsError(NotImplementedError):
+    # Raised when we encounter multiples of clinical classifications or their attributes when not expected.
+    # This is new as of ClinVar XSD V2 and will be supported at some point in the future.
+    pass
+
+
+class ClinicalClassification:
+
+    # A score for the review status of the assigned clinical significance ranges from 0 to 4 and corresponds to the
+    # number of "gold stars" displayed on ClinVar website. See details here:
+    # https://www.ncbi.nlm.nih.gov/clinvar/docs/details/#review_status
+    score_map = {
+        'no assertion provided': 0,  # v1 only
+        'no classification provided': 0,
+        'no classification for the single variant': 0,
+        'no assertion criteria provided': 0,
+        'no classifications from unflagged records': 0,
+        'criteria provided, single submitter': 1,
+        'criteria provided, conflicting interpretations': 1,  # v1 only
+        'criteria provided, conflicting classifications': 1,
+        'criteria provided, multiple submitters, no conflicts': 2,
+        'reviewed by expert panel': 3,
+        'practice guideline': 4,
+    }
+
+    # Some records have been flagged by ClinVar and should not be used.
+    INVALID_CLINICAL_SIGNIFICANCES = {'no classifications from unflagged records'}
+
+    def __init__(self, class_xml, clinvar_record):
+        self.class_xml = class_xml
+        self.clinvar_record = clinvar_record
+        self.xsd_version = clinvar_record.xsd_version
+        # Type of clinical classification: germline, somatic, or oncogenicity
+        self.type = class_xml.tag
+
+    @property
+    def last_evaluated_date(self):
+        """This tracks the latest (re)evaluation date for the clinical interpretation.
+        See https://github.com/opentargets/platform/issues/1161#issuecomment-683938510 for details."""
+        if self.xsd_version < 2:
+            # The DateLastEvaluated attribute is not always present. In this case, this property will be None.
+            return self.class_xml.attrib.get('DateLastEvaluated')
+        return find_optional_unique_element(self.class_xml, './DateLastEvaluated')
+
+    @property
+    def review_status(self):
+        """Return a review status text for the assigned clinical significance. See score_map above for the list of
+        possible values."""
+        review_status = find_mandatory_unique_element(self.class_xml, './ReviewStatus').text
+        assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.accession}'
+        return review_status
+
+    @property
+    def score(self):
+        """Return a score (star rating) for the assigned clinical significance. See score_map above."""
+        return self.score_map[self.review_status]
+
+    @property
+    def clinical_significance_raw(self):
+        """The original clinical significance string as stored in ClinVar. Example: 'Benign/Likely benign'."""
+        try:
+            return find_mandatory_unique_element(self.class_xml, './Description').text
+        except AssertionError as e:
+            raise MultipleClinicalClassificationsError(f'Found multiple descriptions for one ClinicalClassification in '
+                                      f'{self.clinvar_record.accession}')
+
+    @property
+    def clinical_significance_list(self):
+        """The normalised deduplicated list of all clinical significance values. The original value is (1) split into
+        multiple values by 3 delimiters: ('/', ', ', '; '), (2) converted into lowercase and (3) sorted
+        lexicographically. Example: 'Benign/Likely benign, risk_factor' → ['benign', 'likely benign', 'risk factor'].
+        See /data-exploration/clinvar-variant-types/README.md for further explanation."""
+        return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))
+
+    @property
+    def valid_clinical_significances(self):
+        return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNIFICANCES]
diff --git a/cmat/clinvar_xml_io/clinvar_dataset.py b/cmat/clinvar_xml_io/clinvar_dataset.py
@@ -1,5 +1,6 @@
 import gzip
 import logging
+import re
 from datetime import date
 
 from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
@@ -15,10 +16,21 @@ def __init__(self, clinvar_xml):
         self.clinvar_xml = clinvar_xml
         self.header_attr = parse_header_attributes(clinvar_xml)
         self.header_attr['LastProcessed'] = date.today().strftime('%Y-%m-%d')
+        self.xsd_version = self.get_xsd_version()
 
     def __iter__(self):
         for rcv in iterate_rcv_from_xml(self.clinvar_xml):
-            yield ClinVarRecord(rcv)
+            yield ClinVarRecord(rcv, self.xsd_version)
+
+    def get_xsd_version(self):
+        # For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md
+        if 'xsi:noNamespaceSchemaLocation' in self.header_attr:
+            url = self.header_attr['xsi:noNamespaceSchemaLocation']
+            m = re.search(r'(ClinVar_RCV|clinvar_public)_([0-9.]+)\.xsd', url, flags=re.IGNORECASE)
+            if m and m.group(2):
+                return float(m.group(2))
+        # If we cannot parse, assume v2
+        return 2.0
 
     def write_header(self, output_file):
         header = b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<ReleaseSet'

diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py
@@ -3,6 +3,7 @@
 import xml.etree.ElementTree as ElementTree
 from xml.dom import minidom
 
+from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
 from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure
 from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait
 from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \
@@ -18,28 +19,13 @@ class ClinVarRecord:
     * Issue https://github.com/EBIvariation/eva-opentargets/issues/127 for the most recent discussions on changing
       support of different ClinVar record types."""
 
-    # A score for the review status of the assigned clinical significance ranges from 0 to 4 and corresponds to the
-    # number of "gold stars" displayed on ClinVar website. See details here:
-    # https://www.ncbi.nlm.nih.gov/clinvar/docs/details/#review_status
-    score_map = {
-        "no assertion provided": 0,
-        'no assertion criteria provided': 0,
-        'no classifications from unflagged records': 0,
-        'criteria provided, single submitter': 1,
-        'criteria provided, conflicting interpretations': 1,
-        'criteria provided, multiple submitters, no conflicts': 2,
-        'reviewed by expert panel': 3,
-        'practice guideline': 4,
-    }
-
     # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
     NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
-    # Some records have been flagged by ClinVar and should not be used.
-    INVALID_CLINICAL_SIGNFICANCES = {'no classifications from unflagged records'}
 
-    def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
+    def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
         """Initialise a ClinVar record object from an RCV XML record."""
         self.rcv = rcv
+        self.xsd_version = xsd_version
 
         # Add a list of traits
         self.trait_set = []
@@ -57,6 +43,16 @@ def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMea
         else:
             self.measure = measure_class(variant_measure, self)
 
+        # List of clinical classifications (Germline, Somatic, or Oncogenecity
+        self.clinical_classifications = []
+        if self.xsd_version < 2:
+            # V1 only ever has a single clinical classification / clinical significance
+            self.clinical_classifications.append(
+                ClinicalClassification(find_mandatory_unique_element(self.rcv, './ClinicalSignificance'), self))
+        else:
+            for clin_class in find_elements(self.rcv, './Classifications/*'):
+                self.clinical_classifications.append(ClinicalClassification(clin_class, self))
+
     def __str__(self):
         return f'ClinVarRecord object with accession {self.accession}'
 
@@ -83,26 +79,6 @@ def created_date(self):
         """This tracks the date the record was first made public on ClinVar."""
         return self.rcv.attrib['DateCreated']
 
-    @property
-    def last_evaluated_date(self):
-        """This tracks the latest (re)evaluation date for the clinical interpretation.
-        See https://github.com/opentargets/platform/issues/1161#issuecomment-683938510 for details."""
-        # The DateLastEvaluated attribute is not always present. In this case, this property will be None.
-        return find_mandatory_unique_element(self.rcv, './ClinicalSignificance').attrib.get('DateLastEvaluated')
-
-    @property
-    def review_status(self):
-        """Return a review status text for the assigned clinical significance. See score_map above for the list of
-        possible values."""
-        review_status = find_mandatory_unique_element(self.rcv, './ClinicalSignificance/ReviewStatus').text
-        assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.accession}'
-        return review_status
-
-    @property
-    def score(self):
-        """Return a score (star rating) for the assigned clinical significance. See score_map above."""
-        return self.score_map[self.review_status]
-
     @property
     def mode_of_inheritance(self):
         """Return a (possibly empty) list of modes of inheritance for a given ClinVar record."""
@@ -133,27 +109,44 @@ def evidence_support_pubmed_refs(self):
                 for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]
 
     @property
-    def clinical_significance_raw(self):
-        """The original clinical significance string as stored in ClinVar. Example: 'Benign/Likely benign'."""
-        return find_mandatory_unique_element(self.rcv, './ClinicalSignificance/Description').text
+    def allele_origins(self):
+        return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
 
     @property
-    def clinical_significance_list(self):
-        """The normalised deduplicated list of all clinical significance values. The original value is (1) split into
-        multiple values by 3 delimiters: ('/', ', ', '; '), (2) converted into lowercase and (3) sorted
-        lexicographically. Example: 'Benign/Likely benign, risk_factor' → ['benign', 'likely benign', 'risk factor'].
-        See /data-exploration/clinvar-variant-types/README.md for further explanation."""
-        return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))
+    def valid_allele_origins(self):
+        """Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
+        return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}
+
+    # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
+    # if there is exactly one ClinicalClassification for the record.
+    # Otherwise these should be taken from the ClinicalClassification objects directly.
 
     @property
-    def valid_clinical_significances(self):
-        return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNFICANCES]
+    def last_evaluated_date(self):
+        if len(self.clinical_classifications) > 1:
+            raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
+        return self.clinical_classifications[0].last_evaluated_date
 
     @property
-    def allele_origins(self):
-        return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
+    def review_status(self):
+        if len(self.clinical_classifications) > 1:
+            raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
+        return self.clinical_classifications[0].review_status
 
     @property
-    def valid_allele_origins(self):
-        """Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
-        return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}
+    def score(self):
+        if len(self.clinical_classifications) > 1:
+            raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
+        return self.clinical_classifications[0].score
+
+    @property
+    def clinical_significance_list(self):
+        if len(self.clinical_classifications) > 1:
+            raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
+        return self.clinical_classifications[0].clinical_significance_list
+
+    @property
+    def valid_clinical_significances(self):
+        if len(self.clinical_classifications) > 1:
+            raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
+        return self.clinical_classifications[0].valid_clinical_significances
diff --git a/cmat/clinvar_xml_io/xml_parsing.py b/cmat/clinvar_xml_io/xml_parsing.py
@@ -10,7 +10,7 @@ def parse_header_attributes(clinvar_xml):
     """Parses out attributes to the root-level ReleaseSet element and returns them as a dict."""
     attrib = None
     with gzip.open(clinvar_xml, 'rt') as fh:
-        for event, elem in ElementTree.iterparse(fh):
+        for event, elem in ElementTree.iterparse(fh, events=['start']):
             if elem.tag == 'ReleaseSet':
                 attrib = elem.attrib
                 break

diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py
@@ -68,7 +68,7 @@ def __iter__(self):
             self.mismatches_file.write('RCV\tCV\tCMAT\n')
 
         for rcv in iterate_rcv_from_xml(self.clinvar_xml):
-            record = AnnotatedClinVarRecord(rcv)
+            record = AnnotatedClinVarRecord(rcv, self.xsd_version)
             self.annotate(record)
             yield record
 
@@ -213,8 +213,9 @@ def print_counter(counter):
 
 class AnnotatedClinVarRecord(ClinVarRecord):
 
-    def __init__(self, rcv):
-        super().__init__(rcv, trait_class=OntologyMappedClinVarTrait, measure_class=EnsemblAnnotatedClinVarMeasure)
+    def __init__(self, rcv, xsd_version):
+        super().__init__(rcv, xsd_version, trait_class=OntologyMappedClinVarTrait,
+                         measure_class=EnsemblAnnotatedClinVarMeasure)
 
 
 class OntologyMappedClinVarTrait(ClinVarTrait):

diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py
@@ -9,6 +9,7 @@
 import jsonschema
 
 from cmat.clinvar_xml_io import ClinVarDataset
+from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
 from cmat.output_generation import consequence_type as CT
 from cmat.output_generation.report import Report
 
@@ -77,6 +78,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
 
         # Catch any exceptions for this record so we can continue processing.
         try:
+            # Failure mode 0 (skip). Contains multiple clinical classification annotations.
+            # This is new as of V2 of the ClinVar XSD and should definitely be supported at some point,
+            # but as it can cause parsing complications we catch these cases first.
+            # See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396
+            if len(clinvar_record.clinical_classifications) > 1:
+                logger.warning(f'Found multiple clinical classifications in record {clinvar_record.accession}')
+                report.clinvar_skip_multiple_clinical_classifications += 1
+                continue
+
             # Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
             # potentially mappable name).
             if not clinvar_record.traits_with_valid_names:
@@ -153,6 +163,11 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
             report.complete_evidence_string_count += complete_evidence_strings_generated
             report.evidence_string_count += evidence_strings_generated
 
+        except MultipleClinicalClassificationsError as mcce:
+            # Ensure we catch any of these that fall through (e.g. from multiple description text)
+            logger.error(str(mcce))
+            report.clinvar_skip_multiple_clinical_classifications += 1
+
         except Exception as e:
             # We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
             # records and printing the report.