Skip to content

Commit

Permalink
count multiple clinical classification records
Browse files Browse the repository at this point in the history
  • Loading branch information
apriltuesday committed May 24, 2024
1 parent b227f46 commit 6feecc8
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 14 deletions.
16 changes: 11 additions & 5 deletions cmat/clinvar_xml_io/clinical_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_optional_unique_element


class MultipleClinicalClassificationsError(NotImplementedError):
    """Raised when multiple clinical classifications, or multiples of their attributes, are found where only
    one is expected.

    Such multiples are new as of ClinVar XSD V2 and will be supported at some point in the future.
    """


class ClinicalClassification:

# A score for the review status of the assigned clinical significance ranges from 0 to 4 and corresponds to the
Expand All @@ -29,7 +35,8 @@ def __init__(self, class_xml, clinvar_record):
self.class_xml = class_xml
self.clinvar_record = clinvar_record
self.xsd_version = clinvar_record.xsd_version
# TODO log the type somewhere....
# Type of clinical classification: germline, somatic, or oncogenicity
self.type = class_xml.tag

@property
def last_evaluated_date(self):
Expand All @@ -45,8 +52,7 @@ def review_status(self):
"""Return a review status text for the assigned clinical significance. See score_map above for the list of
possible values."""
review_status = find_mandatory_unique_element(self.class_xml, './ReviewStatus').text
# TODO replace this assert with something less crash-y
# assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.accession}'
assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.accession}'
return review_status

@property
Expand All @@ -60,8 +66,8 @@ def clinical_significance_raw(self):
try:
return find_mandatory_unique_element(self.class_xml, './Description').text
except AssertionError as e:
# TODO log
return None
raise MultipleClinicalClassificationsError(f'Found multiple descriptions for one ClinicalClassification in '
f'{self.clinvar_record.accession}')

@property
def clinical_significance_list(self):
Expand Down
14 changes: 7 additions & 7 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import xml.etree.ElementTree as ElementTree
from xml.dom import minidom

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification
from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure
from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait
from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \
Expand Down Expand Up @@ -119,34 +119,34 @@ def valid_allele_origins(self):

# The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
# if there is exactly one ClinicalClassification for the record.
# Otherwise these are best taken from the ClinicalClassification objects directly.
# Otherwise these should be taken from the ClinicalClassification objects directly.

@property
def last_evaluated_date(self):
if len(self.clinical_classifications) > 1:
return None
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].last_evaluated_date

@property
def review_status(self):
if len(self.clinical_classifications) > 1:
return None
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].review_status

@property
def score(self):
if len(self.clinical_classifications) > 1:
return None
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].score

@property
def clinical_significance_list(self):
if len(self.clinical_classifications) > 1:
return None
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].clinical_significance_list

@property
def valid_clinical_significances(self):
if len(self.clinical_classifications) > 1:
return None
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].valid_clinical_significances
15 changes: 15 additions & 0 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import jsonschema

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

Expand Down Expand Up @@ -77,6 +78,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

# Catch any exceptions for this record so we can continue processing.
try:
# Failure mode 0 (skip). Contains multiple clinical classification annotations.
# This is new as of V2 of the ClinVar XSD and should definitely be supported at some point,
# but as it can cause parsing complications we catch these cases first.
# See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396
if len(clinvar_record.clinical_classifications) > 1:
logger.warning(f'Found multiple clinical classifications in record {clinvar_record.accession}')
report.clinvar_skip_multiple_clinical_classifications += 1
continue

# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# potentially mappable name).
if not clinvar_record.traits_with_valid_names:
Expand Down Expand Up @@ -153,6 +163,11 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
report.complete_evidence_string_count += complete_evidence_strings_generated
report.evidence_string_count += evidence_strings_generated

except MultipleClinicalClassificationsError as mcce:
# Ensure we catch any of these that fall through (e.g. from multiple description text)
logger.error(str(mcce))
report.clinvar_skip_multiple_clinical_classifications += 1

except Exception as e:
# We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
# records and printing the report.
Expand Down
5 changes: 4 additions & 1 deletion cmat/output_generation/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None):
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
self.clinvar_skip_invalid_evidence_string = 0
self.clinvar_skip_multiple_clinical_classifications = 0
self.clinvar_done_one_complete_evidence_string = 0
self.clinvar_done_multiple_complete_evidence_strings = 0
self.clinvar_fatal = 0
Expand Down Expand Up @@ -89,7 +90,8 @@ def compute_record_tallies(self):
"""Compute tallies of records fatal/skipped/done based on the more granular counts."""
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance
self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string +
self.clinvar_skip_multiple_clinical_classifications)
self.clinvar_done = (self.clinvar_done_one_complete_evidence_string +
self.clinvar_done_multiple_complete_evidence_strings)

Expand Down Expand Up @@ -118,6 +120,7 @@ def print_report(self):
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
Multiple clinical classifications\t{self.clinvar_skip_multiple_clinical_classifications}
Done: Generated at least one complete evidence string\t{self.clinvar_done}
One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
Expand Down
Binary file not shown.
Binary file modified tests/clinvar_xml_io/resources/test_clinvar_dataset.xml.gz
Binary file not shown.
19 changes: 19 additions & 0 deletions tests/clinvar_xml_io/test_clinvar_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

import pytest

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError

resources_dir = os.path.join(os.path.dirname(__file__), 'resources')


def test_multiple_clinical_classifications_record():
    """Check a record carrying two clinical classifications: both are exposed individually, while the
    record-level valid_clinical_significances property raises MultipleClinicalClassificationsError."""
    # Input dataset with only one record
    input_file = os.path.join(resources_dir, 'multiple_classifications.xml.gz')
    record = next(iter(ClinVarDataset(input_file)))

    assert len(record.clinical_classifications) == 2
    assert {cc.type for cc in record.clinical_classifications} == {'GermlineClassification', 'SomaticClinicalImpact'}
    with pytest.raises(MultipleClinicalClassificationsError):
        # Evaluating the property is what raises; there is nothing useful to print.
        _ = record.valid_clinical_significances
2 changes: 1 addition & 1 deletion tests/clinvar_xml_io/test_xml_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ def test_parse_header_attributes():
input_file = os.path.join(resources_dir, 'test_clinvar_dataset.xml.gz')
header_attr = parse_header_attributes(input_file)
assert header_attr['Dated'] == '2023-02-22'
assert header_attr['xsi:noNamespaceSchemaLocation'] == 'http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.60.xsd'
assert header_attr['xsi:noNamespaceSchemaLocation'] == 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/RCV/ClinVar_RCV_2.0.xsd'

0 comments on commit 6feecc8

Please sign in to comment.