Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model for ClinVar submitted records (SCVs) #442

Merged
merged 4 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bin/cmat/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.2.2
3.3.0.dev
50 changes: 27 additions & 23 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import re
import xml.etree.ElementTree as ElementTree
from functools import cached_property
from xml.dom import minidom

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
Expand All @@ -22,42 +23,32 @@ class ClinVarRecord:
# Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}

def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
"""Initialise a ClinVar record object from an RCV XML record."""
self.rcv = rcv
self.record_xml = record_xml
self.xsd_version = xsd_version

# Add a list of traits
self.trait_set = []
for trait in find_elements(self.rcv, './TraitSet/Trait'):
for trait in find_elements(self.record_xml, './TraitSet/Trait'):
self.trait_set.append(trait_class(trait, self))

# We are currently only processing MeasureSets of type Variant which are included directly in the RCV record.
# Some other options (currently not supported) are:
# * MeasureSet of types "Haplotype", "Phase unknown", or "Distinct chromosomes"
# * GenotypeSet, which contains an assertion about a group of variants from different chromosome copies, with
# the type of be either a "CompoundHeterozygote" or a "Diplotype"
variant_measure = find_optional_unique_element(self.rcv, './MeasureSet[@Type="Variant"]/Measure')
variant_measure = find_optional_unique_element(self.record_xml, './MeasureSet[@Type="Variant"]/Measure')
if not variant_measure:
self.measure = None
else:
self.measure = measure_class(variant_measure, self)

# List of clinical classifications (Germline, Somatic, or Oncogenecity
self.clinical_classifications = []
if self.xsd_version < 2:
# V1 only ever has a single clinical classification / clinical significance
self.clinical_classifications.append(
ClinicalClassification(find_mandatory_unique_element(self.rcv, './ClinicalSignificance'), self))
else:
for clin_class in find_elements(self.rcv, './Classifications/*'):
self.clinical_classifications.append(ClinicalClassification(clin_class, self))

def __str__(self):
return f'ClinVarRecord object with accession {self.accession}'

def write(self, output):
xml_str = minidom.parseString(ElementTree.tostring(self.rcv)).toprettyxml(indent=' ', encoding='utf-8')
xml_str = minidom.parseString(ElementTree.tostring(self.record_xml)).toprettyxml(indent=' ', encoding='utf-8')
# version 3.8 adds superfluous root
if xml_str.startswith(b'<?xml'):
xml_str = re.sub(b'<\?xml.*?>', b'', xml_str)
Expand All @@ -67,28 +58,28 @@ def write(self, output):

@property
def accession(self):
return find_mandatory_unique_element(self.rcv, './ClinVarAccession').attrib['Acc']
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']

@property
def date(self):
def last_updated_date(self):
"""This tracks the latest update date, counting even minor technical updates."""
return self.rcv.attrib['DateLastUpdated']
return self.record_xml.attrib['DateLastUpdated']

@property
def created_date(self):
"""This tracks the date the record was first made public on ClinVar."""
return self.rcv.attrib['DateCreated']
return self.record_xml.attrib['DateCreated']

@property
def mode_of_inheritance(self):
"""Return a (possibly empty) list of modes of inheritance for a given ClinVar record."""
return sorted({
elem.text for elem in find_elements(self.rcv, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
elem.text for elem in find_elements(self.record_xml, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
})

@property
def trait_set_type(self):
return find_mandatory_unique_element(self.rcv, './TraitSet').attrib['Type']
return find_mandatory_unique_element(self.record_xml, './TraitSet').attrib['Type']

@property
def traits(self):
Expand All @@ -106,17 +97,30 @@ def evidence_support_pubmed_refs(self):
specific disease. These are the references displayed on the ClinVar website in the "Assertion and evidence
details" section at the bottom of the page."""
return [int(elem.text)
for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]
for elem in find_elements(self.record_xml, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]

@property
def allele_origins(self):
return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
return {elem.text for elem in find_elements(self.record_xml, './ObservedIn/Sample/Origin')}

@property
def valid_allele_origins(self):
"""Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}

@cached_property
def clinical_classifications(self):
apriltuesday marked this conversation as resolved.
Show resolved Hide resolved
"""List of clinical classifications (Germline, Somatic, or Oncogenecity)"""
clinical_classifications = []
if self.xsd_version < 2:
# V1 only ever has a single clinical classification / clinical significance
clinical_classifications.append(
ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self))
else:
for clin_class in find_elements(self.record_xml, './Classifications/*'):
clinical_classifications.append(ClinicalClassification(clin_class, self))
return clinical_classifications

# The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
# if there is exactly one ClinicalClassification for the record.
# Otherwise these should be taken from the ClinicalClassification objects directly.
Expand Down
30 changes: 30 additions & 0 deletions cmat/clinvar_xml_io/clinvar_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements


class ClinVarSet:
"""
A ClinVarSet groups together a single reference record (RCV) and one or more submitted records (SCVs).
"""

def __init__(self, cvs_xml, xsd_version):
self.cvs_xml = cvs_xml

rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
self.rcv = ClinVarRecord(rcv_elem, xsd_version)

scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]

@property
def id(self):
return self.cvs_xml.attrib['ID']

@property
def title(self):
return find_mandatory_unique_element(self.cvs_xml, './Title').text

@property
def status(self):
return find_mandatory_unique_element(self.cvs_xml, './RecordStatus').text
61 changes: 61 additions & 0 deletions cmat/clinvar_xml_io/clinvar_submitted_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import logging
from functools import cached_property

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ClinVarSubmittedRecord(ClinVarRecord):
"""
Submitted records (SCVs) are structured similarly to reference records (RCVs) with a few exceptions, though they
typically have fewer annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are
added by curators.

SCVs also contain additional information about the actual submission, which we model in this class.
"""

def __init__(self, record_xml, xsd_version, reference_record):
super().__init__(record_xml, xsd_version)
# Each SCV is associated with a single RCV
self.reference_record = reference_record

def __str__(self):
return f'ClinVarSubmittedRecord object with accession {self.accession}'

@property
def submission_date(self):
"""Date of submission or when submission was last revised (for first submission, use created_date)."""
return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitterDate']

@property
def last_updated_date(self):
"""Overrides parent definition, in SCV this date is in the accession element"""
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateUpdated']

@property
def created_date(self):
"""Overrides parent definition, in SCV this date is in the accession element"""
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateCreated']

@property
def submitter(self):
"""Name of the submitting organization."""
return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitter']

@property
def submitter_id(self):
"""Numeric identifier associated with the submitting organization."""
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['OrgID']

@property
def submission_name(self):
"""Name or identifier associated with the submission. This is optional."""
return self.record_xml.attrib.get('SubmissionName', None)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be used for the gene-related condition filtering, confirmed it's present in that case


@cached_property
def clinical_classifications(self):
# Submitted record clinical classifications are defined a bit differently than reference records
raise NotImplementedError('Clinical classification parsing not implemented for SCVs')
16 changes: 10 additions & 6 deletions cmat/clinvar_xml_io/xml_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,22 @@ def parse_header_attributes(clinvar_xml):

def iterate_rcv_from_xml(clinvar_xml):
"""Iterates through the gzipped ClinVar XML and yields complete <ReferenceClinVarAssertion> records."""
for cvs in iterate_cvs_from_xml(clinvar_xml):
# Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
# ClinVar. There should only be one such record per ClinVarSet.
rcv = find_mandatory_unique_element(cvs, 'ReferenceClinVarAssertion')
yield rcv


def iterate_cvs_from_xml(clinvar_xml):
"""Iterates through the gzipped ClinVar XML and yields complete <ClinVarSet> elements."""
with gzip.open(clinvar_xml, 'rt') as fh:
for event, elem in ElementTree.iterparse(fh):
# Wait until we have built a complete ClinVarSet element
if elem.tag != 'ClinVarSet':
continue

# Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
# ClinVar. There should only be one such record per ClinVarSet.
rcv = find_mandatory_unique_element(elem, 'ReferenceClinVarAssertion')

# Return the complete record and then remove the processed element from the tree to save memory
yield rcv
yield elem
elem.clear()


Expand Down
4 changes: 2 additions & 2 deletions cmat/output_generation/annotated_clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,8 @@ def print_counter(counter):

class AnnotatedClinVarRecord(ClinVarRecord):

def __init__(self, rcv, xsd_version):
super().__init__(rcv, xsd_version, trait_class=OntologyMappedClinVarTrait,
def __init__(self, record_xml, xsd_version):
super().__init__(record_xml, xsd_version, trait_class=OntologyMappedClinVarTrait,
measure_class=EnsemblAnnotatedClinVarMeasure)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ def rcv_to_link(rcv_id):
rcv_id = clinvar_record.accession

# RCV can contain either a MeasureSet, or a GenotypeSet. It must not contain both.
measure_sets = clinvar_record.rcv.findall('MeasureSet')
genotype_sets = clinvar_record.rcv.findall('GenotypeSet')
measure_sets = clinvar_record.record_xml.findall('MeasureSet')
genotype_sets = clinvar_record.record_xml.findall('GenotypeSet')
if len(measure_sets) == 1 and len(genotype_sets) == 0:
# Most common case. RCV directly contains one measure set.
measure_set = measure_sets[0]
Expand Down
14 changes: 1 addition & 13 deletions data-exploration/filter_clinvar_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import xml.etree.ElementTree as ElementTree

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, iterate_cvs_from_xml
from cmat.output_generation.clinvar_to_evidence_strings import get_consequence_types
from cmat.output_generation.consequence_type import process_consequence_type_file

Expand All @@ -18,18 +18,6 @@ def pprint(x):
print(ElementTree.tostring(x, encoding='unicode'))


def iterate_cvs_from_xml(clinvar_xml):
"""Similar to iterate_rcv_from_xml in clinvar_xml_utils, but keeps the entire ClinVarSet XML element.
This allows us to construct a valid ClinVar XML for easy future processing."""
with gzip.open(clinvar_xml, 'rt') as fh:
for event, elem in ElementTree.iterparse(fh):
# Wait until we have built a complete ClinVarSet element
if elem.tag != 'ClinVarSet':
continue
yield elem
elem.clear()


def filter_xml(input_xml, output_xml, filter_fct, max_num=None):
""" Filter input_xml by boolean condition defined by filter_fct and write to output_xml.
If max_num is given, will write at most max_num records, otherwise writes all."""
Expand Down
40 changes: 40 additions & 0 deletions tests/clinvar_xml_io/test_clinvar_measure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os

from cmat.clinvar_xml_io import ClinVarDataset

resources_dir = os.path.join(os.path.dirname(__file__), 'resources')


class TestClinvarRecordMeasure:
@classmethod
def setup_class(cls):
input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
cls.test_crm = next(iter(ClinVarDataset(input_file))).measure

def test_hgvs(self):
text_hgvs = [h.text for h in self.test_crm.all_hgvs]
assert text_hgvs == ['NM_152443.3:c.677A>G',
'NG_008321.1:g.32324A>G',
'NC_000014.9:g.67729209A>G',
'NC_000014.8:g.68195926A>G',
'NM_152443.2:c.677A>G',
'Q96NR8:p.Tyr226Cys',
'NP_689656.2:p.Tyr226Cys']

def test_preferred_current_hgvs(self):
assert self.test_crm.preferred_current_hgvs.text == 'NC_000014.9:g.67729209A>G'

def test_rs(self):
assert self.test_crm.rs_id == 'rs28940313'

def test_nsv(self):
assert self.test_crm.nsv_id is None

def test_variant_type(self):
assert self.test_crm.variant_type == 'single nucleotide variant'

def test_measure_set_pubmed_refs(self):
assert self.test_crm.pubmed_refs == []

def test_so_terms(self):
assert self.test_crm.existing_so_terms == {'SO:0001583'}
42 changes: 42 additions & 0 deletions tests/clinvar_xml_io/test_clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,45 @@ def test_multiple_clinical_classifications_record():
assert set(cc.type for cc in record.clinical_classifications) == {'GermlineClassification', 'SomaticClinicalImpact'}
with pytest.raises(MultipleClinicalClassificationsError):
print(record.valid_clinical_significances)


class TestClinvarRecord:
@classmethod
def setup_class(cls):
input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
cls.test_clinvar_record = next(iter(ClinVarDataset(input_file)))

def test_date(self):
"""Check that the last updated date of the referenceClinVarAssertion is loaded correctly"""
assert self.test_clinvar_record.last_updated_date == '2024-04-15'

def test_score(self):
assert self.test_clinvar_record.score == 2

def test_review_status(self):
assert self.test_clinvar_record.review_status == 'criteria provided, multiple submitters, no conflicts'

def test_acc(self):
assert self.test_clinvar_record.accession == 'RCV000002127'

def test_traits(self):
assert self.test_clinvar_record.traits[0].preferred_name == 'Leber congenital amaurosis 13'
assert self.test_clinvar_record.traits[0].preferred_or_other_valid_name == 'Leber congenital amaurosis 13'

def test_trait_pubmed_refs(self):
assert self.test_clinvar_record.traits[0].pubmed_refs == [20301590, 30285347]

def test_observed_pubmed_refs(self):
assert self.test_clinvar_record.evidence_support_pubmed_refs == [15258582, 15322982]

def test_clinical_significance(self):
assert self.test_clinvar_record.clinical_significance_list == ['likely pathogenic', 'pathogenic']

def test_allele_origins(self):
assert self.test_clinvar_record.allele_origins == {'germline', 'inherited', 'unknown'}

def test_valid_allele_origins(self):
assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'}

def test_trait_efo_ids(self):
assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')]
Loading
Loading