diff --git a/cmat/clinvar_xml_io/clinical_classification.py b/cmat/clinvar_xml_io/clinical_classification.py index dd99e5dc..33feaa18 100644 --- a/cmat/clinvar_xml_io/clinical_classification.py +++ b/cmat/clinvar_xml_io/clinical_classification.py @@ -3,6 +3,12 @@ from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_optional_unique_element +class MultipleClinicalClassificationsError(NotImplementedError): + # Raised when we encounter multiples of clinical classifications or their attributes when not expected. + # This is new as of ClinVar XSD V2 and will be supported at some point in the future. + pass + + class ClinicalClassification: # A score for the review status of the assigned clinical significance ranges from 0 to 4 and corresponds to the @@ -29,7 +35,8 @@ def __init__(self, class_xml, clinvar_record): self.class_xml = class_xml self.clinvar_record = clinvar_record self.xsd_version = clinvar_record.xsd_version - # TODO log the type somewhere.... + # Type of clinical classification: germline, somatic, or oncogenicity + self.type = class_xml.tag @property def last_evaluated_date(self): @@ -45,8 +52,7 @@ def review_status(self): """Return a review status text for the assigned clinical significance. 
See score_map above for the list of possible values.""" review_status = find_mandatory_unique_element(self.class_xml, './ReviewStatus').text - # TODO replace this assert with something less crash-y - # assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.accession}' + assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.clinvar_record.accession}' return review_status @property @@ -60,8 +66,8 @@ def clinical_significance_raw(self): try: return find_mandatory_unique_element(self.class_xml, './Description').text except AssertionError as e: - # TODO log - return None + raise MultipleClinicalClassificationsError(f'Found multiple descriptions for one ClinicalClassification in ' + f'{self.clinvar_record.accession}') from e @property def clinical_significance_list(self): diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index ffbb906e..72c28cb4 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -3,7 +3,7 @@ import xml.etree.ElementTree as ElementTree from xml.dom import minidom -from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification +from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \ @@ -119,34 +119,34 @@ def valid_allele_origins(self): # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord # if there is exactly one ClinicalClassification for the record. - # Otherwise these are best taken from the ClinicalClassification objects directly. + # Otherwise these should be taken from the ClinicalClassification objects directly. 
@property def last_evaluated_date(self): if len(self.clinical_classifications) > 1: - return None + raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}') return self.clinical_classifications[0].last_evaluated_date @property def review_status(self): if len(self.clinical_classifications) > 1: - return None + raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}') return self.clinical_classifications[0].review_status @property def score(self): if len(self.clinical_classifications) > 1: - return None + raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}') return self.clinical_classifications[0].score @property def clinical_significance_list(self): if len(self.clinical_classifications) > 1: - return None + raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}') return self.clinical_classifications[0].clinical_significance_list @property def valid_clinical_significances(self): if len(self.clinical_classifications) > 1: - return None + raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}') return self.clinical_classifications[0].valid_clinical_significances diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index b1cb25c2..a9c4c16c 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -9,6 +9,7 @@ import jsonschema from cmat.clinvar_xml_io import ClinVarDataset +from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError from cmat.output_generation import consequence_type as CT from cmat.output_generation.report import Report @@ -77,6 +78,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings # Catch any 
exceptions for this record so we can continue processing. try: + # Failure mode 0 (skip). Contains multiple clinical classification annotations. + # This is new as of V2 of the ClinVar XSD and should definitely be supported at some point, + # but as it can cause parsing complications we catch these cases first. + # See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396 + if len(clinvar_record.clinical_classifications) > 1: + logger.warning(f'Found multiple clinical classifications in record {clinvar_record.accession}') + report.clinvar_skip_multiple_clinical_classifications += 1 + continue + # Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid, # potentially mappable name). if not clinvar_record.traits_with_valid_names: @@ -153,6 +163,11 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings report.complete_evidence_string_count += complete_evidence_strings_generated report.evidence_string_count += evidence_strings_generated + except MultipleClinicalClassificationsError as mcce: + # Ensure we catch any of these that fall through (e.g. from multiple description text) + logger.error(str(mcce)) + report.clinvar_skip_multiple_clinical_classifications += 1 + except Exception as e: # We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all # records and printing the report. 
diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index b2403c7b..714a502e 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -31,6 +31,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None): self.clinvar_skip_no_functional_consequences = 0 self.clinvar_skip_missing_efo_mapping = 0 self.clinvar_skip_invalid_evidence_string = 0 + self.clinvar_skip_multiple_clinical_classifications = 0 self.clinvar_done_one_complete_evidence_string = 0 self.clinvar_done_multiple_complete_evidence_strings = 0 self.clinvar_fatal = 0 @@ -89,7 +90,8 @@ def compute_record_tallies(self): """Compute tallies of records fatal/skipped/done based on the more granular counts.""" self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + - self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string + + self.clinvar_skip_multiple_clinical_classifications) self.clinvar_done = (self.clinvar_done_one_complete_evidence_string + self.clinvar_done_multiple_complete_evidence_strings) @@ -118,6 +120,7 @@ def print_report(self): No functional consequences\t{self.clinvar_skip_no_functional_consequences} Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping} Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string} + Multiple clinical classifications\t{self.clinvar_skip_multiple_clinical_classifications} Done: Generated at least one complete evidence string\t{self.clinvar_done} One complete evidence string\t{self.clinvar_done_one_complete_evidence_string} Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings} diff --git a/tests/clinvar_xml_io/resources/multiple_classifications.xml.gz 
b/tests/clinvar_xml_io/resources/multiple_classifications.xml.gz new file mode 100644 index 00000000..72bb2c3f Binary files /dev/null and b/tests/clinvar_xml_io/resources/multiple_classifications.xml.gz differ diff --git a/tests/clinvar_xml_io/resources/test_clinvar_dataset.xml.gz b/tests/clinvar_xml_io/resources/test_clinvar_dataset.xml.gz index f00d34bf..ba6271fd 100644 Binary files a/tests/clinvar_xml_io/resources/test_clinvar_dataset.xml.gz and b/tests/clinvar_xml_io/resources/test_clinvar_dataset.xml.gz differ diff --git a/tests/clinvar_xml_io/test_clinvar_record.py b/tests/clinvar_xml_io/test_clinvar_record.py new file mode 100644 index 00000000..0db57f90 --- /dev/null +++ b/tests/clinvar_xml_io/test_clinvar_record.py @@ -0,0 +1,19 @@ +import os + +import pytest + +from cmat.clinvar_xml_io import ClinVarDataset +from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError + +resources_dir = os.path.join(os.path.dirname(__file__), 'resources') + + +def test_multiple_clinical_classifications_record(): + # input dataset with only one record + input_file = os.path.join(resources_dir, 'multiple_classifications.xml.gz') + record = next(iter(ClinVarDataset(input_file))) + + assert len(record.clinical_classifications) == 2 + assert set(cc.type for cc in record.clinical_classifications) == {'GermlineClassification', 'SomaticClinicalImpact'} + with pytest.raises(MultipleClinicalClassificationsError): + print(record.valid_clinical_significances) diff --git a/tests/clinvar_xml_io/test_xml_parsing.py b/tests/clinvar_xml_io/test_xml_parsing.py index 3b734e06..6ac96677 100644 --- a/tests/clinvar_xml_io/test_xml_parsing.py +++ b/tests/clinvar_xml_io/test_xml_parsing.py @@ -10,4 +10,4 @@ def test_parse_header_attributes(): input_file = os.path.join(resources_dir, 'test_clinvar_dataset.xml.gz') header_attr = parse_header_attributes(input_file) assert header_attr['Dated'] == '2023-02-22' - assert header_attr['xsi:noNamespaceSchemaLocation'] == 
'http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.60.xsd' + assert header_attr['xsi:noNamespaceSchemaLocation'] == 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/RCV/ClinVar_RCV_2.0.xsd'