Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model for ClinVar submitted records (SCVs) #442

Merged
merged 4 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bin/cmat/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.2.2
3.3.0.dev
4 changes: 2 additions & 2 deletions cmat/clinvar_xml_io/clinvar_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from datetime import date

from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes

logger = logging.getLogger(__name__)
Expand All @@ -20,7 +20,7 @@ def __init__(self, clinvar_xml):

def __iter__(self):
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
yield ClinVarRecord(rcv, self.xsd_version)
yield ClinVarReferenceRecord(rcv, self.xsd_version)

def get_xsd_version(self):
# For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md
Expand Down
58 changes: 28 additions & 30 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import re
import xml.etree.ElementTree as ElementTree
from functools import cached_property
from xml.dom import minidom

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure
from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait
from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \
Expand All @@ -14,50 +15,40 @@


class ClinVarRecord:
"""Instances of this class hold data on individual ClinVar records. See also:
* /data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model;
* Issue https://github.com/EBIvariation/eva-opentargets/issues/127 for the most recent discussions on changing
support of different ClinVar record types."""
"""
Base class for both reference and submitted records in ClinVar. See also:
/data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model
"""

# Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}

def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
"""Initialise a ClinVar record object from an RCV XML record."""
self.rcv = rcv
self.record_xml = record_xml
self.xsd_version = xsd_version

# Add a list of traits
self.trait_set = []
for trait in find_elements(self.rcv, './TraitSet/Trait'):
for trait in find_elements(self.record_xml, './TraitSet/Trait'):
self.trait_set.append(trait_class(trait, self))

# We are currently only processing MeasureSets of type Variant which are included directly in the RCV record.
# Some other options (currently not supported) are:
# * MeasureSet of types "Haplotype", "Phase unknown", or "Distinct chromosomes"
# * GenotypeSet, which contains an assertion about a group of variants from different chromosome copies, with
# the type of be either a "CompoundHeterozygote" or a "Diplotype"
variant_measure = find_optional_unique_element(self.rcv, './MeasureSet[@Type="Variant"]/Measure')
variant_measure = find_optional_unique_element(self.record_xml, './MeasureSet[@Type="Variant"]/Measure')
if not variant_measure:
self.measure = None
else:
self.measure = measure_class(variant_measure, self)

# List of clinical classifications (Germline, Somatic, or Oncogenecity
self.clinical_classifications = []
if self.xsd_version < 2:
# V1 only ever has a single clinical classification / clinical significance
self.clinical_classifications.append(
ClinicalClassification(find_mandatory_unique_element(self.rcv, './ClinicalSignificance'), self))
else:
for clin_class in find_elements(self.rcv, './Classifications/*'):
self.clinical_classifications.append(ClinicalClassification(clin_class, self))

def __str__(self):
return f'ClinVarRecord object with accession {self.accession}'

def write(self, output):
xml_str = minidom.parseString(ElementTree.tostring(self.rcv)).toprettyxml(indent=' ', encoding='utf-8')
xml_str = minidom.parseString(ElementTree.tostring(self.record_xml)).toprettyxml(indent=' ', encoding='utf-8')
# version 3.8 adds superfluous root
if xml_str.startswith(b'<?xml'):
xml_str = re.sub(b'<\?xml.*?>', b'', xml_str)
Expand All @@ -67,28 +58,30 @@ def write(self, output):

@property
def accession(self):
return find_mandatory_unique_element(self.rcv, './ClinVarAccession').attrib['Acc']
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']

@property
def date(self):
"""This tracks the latest update date, counting even minor technical updates."""
return self.rcv.attrib['DateLastUpdated']
def last_updated_date(self):
"""This tracks the latest update date, counting even minor technical updates.
Appears differently in reference and submitted records."""
raise NotImplementedError

@property
def created_date(self):
"""This tracks the date the record was first made public on ClinVar."""
return self.rcv.attrib['DateCreated']
"""This tracks the date the record was first made public on ClinVar.
Appears differently in reference and submitted records."""
raise NotImplementedError

@property
def mode_of_inheritance(self):
"""Return a (possibly empty) list of modes of inheritance for a given ClinVar record."""
return sorted({
elem.text for elem in find_elements(self.rcv, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
elem.text for elem in find_elements(self.record_xml, './AttributeSet/Attribute[@Type="ModeOfInheritance"]')
})

@property
def trait_set_type(self):
return find_mandatory_unique_element(self.rcv, './TraitSet').attrib['Type']
return find_mandatory_unique_element(self.record_xml, './TraitSet').attrib['Type']

@property
def traits(self):
Expand All @@ -106,17 +99,22 @@ def evidence_support_pubmed_refs(self):
specific disease. These are the references displayed on the ClinVar website in the "Assertion and evidence
details" section at the bottom of the page."""
return [int(elem.text)
for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]
for elem in find_elements(self.record_xml, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]

@property
def allele_origins(self):
return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
return {elem.text for elem in find_elements(self.record_xml, './ObservedIn/Sample/Origin')}

@property
def valid_allele_origins(self):
"""Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}

@cached_property
def clinical_classifications(self):
apriltuesday marked this conversation as resolved.
Show resolved Hide resolved
"""List of clinical classifications (Germline, Somatic, or Oncogenecity)"""
raise NotImplementedError

# The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
# if there is exactly one ClinicalClassification for the record.
# Otherwise these should be taken from the ClinicalClassification objects directly.
Expand Down
41 changes: 41 additions & 0 deletions cmat/clinvar_xml_io/clinvar_reference_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import logging
from functools import cached_property

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification

from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ClinVarReferenceRecord(ClinVarRecord):
"""Reference records (RCVs) summarise information from submitted records (SCVs) and include additional annotations
and cross-references supplied by ClinVar."""

def __init__(self, record_xml, xsd_version):
super().__init__(record_xml, xsd_version)

def __str__(self):
return f'ClinVarReferenceRecord object with accession {self.accession}'

@property
def last_updated_date(self):
return self.record_xml.attrib['DateLastUpdated']

@property
def created_date(self):
return self.record_xml.attrib['DateCreated']

@cached_property
def clinical_classifications(self):
clinical_classifications = []
if self.xsd_version < 2:
# V1 only ever has a single clinical classification / clinical significance
clinical_classifications.append(
ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self))
else:
for clin_class in find_elements(self.record_xml, './Classifications/*'):
clinical_classifications.append(ClinicalClassification(clin_class, self))
return clinical_classifications
30 changes: 30 additions & 0 deletions cmat/clinvar_xml_io/clinvar_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements


class ClinVarSet:
"""
A ClinVarSet groups together a single reference record (RCV) and one or more submitted records (SCVs).
"""

def __init__(self, cvs_xml, xsd_version):
self.cvs_xml = cvs_xml

rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
self.rcv = ClinVarRecord(rcv_elem, xsd_version)

scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]

@property
def id(self):
return self.cvs_xml.attrib['ID']

@property
def title(self):
return find_mandatory_unique_element(self.cvs_xml, './Title').text

@property
def status(self):
return find_mandatory_unique_element(self.cvs_xml, './RecordStatus').text
59 changes: 59 additions & 0 deletions cmat/clinvar_xml_io/clinvar_submitted_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import logging
from functools import cached_property

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ClinVarSubmittedRecord(ClinVarRecord):
"""
Submitted records (SCVs) are structured similarly to reference records (RCVs) with a few exceptions, though they
typically have fewer annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are
added by curators.

SCVs also contain additional information about the actual submission, which we model in this class.
"""

def __init__(self, record_xml, xsd_version, reference_record):
super().__init__(record_xml, xsd_version)
# Each SCV is associated with a single RCV
self.reference_record = reference_record

def __str__(self):
return f'ClinVarSubmittedRecord object with accession {self.accession}'

@property
def submission_date(self):
"""Date of submission or when submission was last revised (for first submission, use created_date)."""
return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitterDate']

@property
def last_updated_date(self):
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateUpdated']

@property
def created_date(self):
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateCreated']

@property
def submitter(self):
"""Name of the submitting organization."""
return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitter']

@property
def submitter_id(self):
"""Numeric identifier associated with the submitting organization."""
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['OrgID']

@property
def submission_name(self):
"""Name or identifier associated with the submission. This is optional."""
return self.record_xml.attrib.get('SubmissionName', None)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be used for the gene-related condition filtering, confirmed it's present in that case


@cached_property
def clinical_classifications(self):
# Submitted record clinical classifications are defined a bit differently than reference records
raise NotImplementedError('Clinical classification parsing not implemented for SCVs')
16 changes: 10 additions & 6 deletions cmat/clinvar_xml_io/xml_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,22 @@ def parse_header_attributes(clinvar_xml):

def iterate_rcv_from_xml(clinvar_xml):
"""Iterates through the gzipped ClinVar XML and yields complete <ReferenceClinVarAssertion> records."""
for cvs in iterate_cvs_from_xml(clinvar_xml):
# Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
# ClinVar. There should only be one such record per ClinVarSet.
rcv = find_mandatory_unique_element(cvs, 'ReferenceClinVarAssertion')
yield rcv


def iterate_cvs_from_xml(clinvar_xml):
"""Iterates through the gzipped ClinVar XML and yields complete <ClinVarSet> elements."""
with gzip.open(clinvar_xml, 'rt') as fh:
for event, elem in ElementTree.iterparse(fh):
# Wait until we have built a complete ClinVarSet element
if elem.tag != 'ClinVarSet':
continue

# Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
# ClinVar. There should only be one such record per ClinVarSet.
rcv = find_mandatory_unique_element(elem, 'ReferenceClinVarAssertion')

# Return the complete record and then remove the processed element from the tree to save memory
yield rcv
yield elem
elem.clear()


Expand Down
4 changes: 2 additions & 2 deletions cmat/output_generation/annotated_clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,8 @@ def print_counter(counter):

class AnnotatedClinVarRecord(ClinVarRecord):

def __init__(self, rcv, xsd_version):
super().__init__(rcv, xsd_version, trait_class=OntologyMappedClinVarTrait,
def __init__(self, record_xml, xsd_version):
super().__init__(record_xml, xsd_version, trait_class=OntologyMappedClinVarTrait,
measure_class=EnsemblAnnotatedClinVarMeasure)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ def rcv_to_link(rcv_id):
rcv_id = clinvar_record.accession

# RCV can contain either a MeasureSet, or a GenotypeSet. It must not contain both.
measure_sets = clinvar_record.rcv.findall('MeasureSet')
genotype_sets = clinvar_record.rcv.findall('GenotypeSet')
measure_sets = clinvar_record.record_xml.findall('MeasureSet')
genotype_sets = clinvar_record.record_xml.findall('GenotypeSet')
if len(measure_sets) == 1 and len(genotype_sets) == 0:
# Most common case. RCV directly contains one measure set.
measure_set = measure_sets[0]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3087,4 +3087,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
14 changes: 1 addition & 13 deletions data-exploration/filter_clinvar_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import xml.etree.ElementTree as ElementTree

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, iterate_cvs_from_xml
from cmat.output_generation.clinvar_to_evidence_strings import get_consequence_types
from cmat.output_generation.consequence_type import process_consequence_type_file

Expand All @@ -18,18 +18,6 @@ def pprint(x):
print(ElementTree.tostring(x, encoding='unicode'))


def iterate_cvs_from_xml(clinvar_xml):
"""Similar to iterate_rcv_from_xml in clinvar_xml_utils, but keeps the entire ClinVarSet XML element.
This allows us to construct a valid ClinVar XML for easy future processing."""
with gzip.open(clinvar_xml, 'rt') as fh:
for event, elem in ElementTree.iterparse(fh):
# Wait until we have built a complete ClinVarSet element
if elem.tag != 'ClinVarSet':
continue
yield elem
elem.clear()


def filter_xml(input_xml, output_xml, filter_fct, max_num=None):
""" Filter input_xml by boolean condition defined by filter_fct and write to output_xml.
If max_num is given, will write at most max_num records, otherwise writes all."""
Expand Down
Loading
Loading