Skip to content

Commit

Permalink
Merge pull request #432 from apriltuesday/new-xml
Browse files Browse the repository at this point in the history
Issue 396 - Migrate to ClinVar XML V2
  • Loading branch information
apriltuesday authored Jun 4, 2024
2 parents 10ca8b3 + 0d51f7a commit 21e78db
Show file tree
Hide file tree
Showing 36 changed files with 1,290 additions and 1,238 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
```
You can use the `--include_transcripts` flag to also include transcript annotations with the functional consequences.

By default, the pipeline will download and annotate the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.
By default, the pipeline will download and annotate the latest ClinVar RCV XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.

### Trait curation

Expand Down Expand Up @@ -125,9 +125,9 @@ nextflow run ${CODE_ROOT}/pipelines/generate_curation_spreadsheet.nf \
-resume
```

By default, the pipeline will download and map the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.
By default, the pipeline will download and map the latest ClinVar RCV XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.

To create the curation spreadsheet, first make your own copy of the [template](https://docs.google.com/spreadsheets/d/1PyDzRs3bO1klvvSv9XuHmx-x7nqZ0UAGeS6aV2SQ2Yg/edit?usp=sharing).
To create the curation spreadsheet, first make your own copy of the [template](https://docs.google.com/spreadsheets/d/1GWAQAZjOpzsIkdCu0CSRDoehZEUB3VjZYYiHWp9Tn7Q/edit?usp=sharing).
Then paste the contents of `${CURATION_ROOT}/google_sheets_table.tsv` into it, starting with column H “ClinVar label”.

#### Manual curation
Expand Down
2 changes: 1 addition & 1 deletion bin/cmat/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.1.3
3.2.0.dev0
82 changes: 82 additions & 0 deletions cmat/clinvar_xml_io/clinical_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import re

from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_optional_unique_element


class MultipleClinicalClassificationsError(NotImplementedError):
# Raised when we encounter multiples of clinical classifications or their attributes when not expected.
# This is new as of ClinVar XSD V2 and will be supported at some point in the future.
pass


class ClinicalClassification:

# A score for the review status of the assigned clinical significance ranges from 0 to 4 and corresponds to the
# number of "gold stars" displayed on ClinVar website. See details here:
# https://www.ncbi.nlm.nih.gov/clinvar/docs/details/#review_status
score_map = {
'no assertion provided': 0, # v1 only
'no classification provided': 0,
'no classification for the single variant': 0,
'no assertion criteria provided': 0,
'no classifications from unflagged records': 0,
'criteria provided, single submitter': 1,
'criteria provided, conflicting interpretations': 1, # v1 only
'criteria provided, conflicting classifications': 1,
'criteria provided, multiple submitters, no conflicts': 2,
'reviewed by expert panel': 3,
'practice guideline': 4,
}

# Some records have been flagged by ClinVar and should not be used.
INVALID_CLINICAL_SIGNIFICANCES = {'no classifications from unflagged records'}

def __init__(self, class_xml, clinvar_record):
self.class_xml = class_xml
self.clinvar_record = clinvar_record
self.xsd_version = clinvar_record.xsd_version
# Type of clinical classification: germline, somatic, or oncogenicity
self.type = class_xml.tag

@property
def last_evaluated_date(self):
"""This tracks the latest (re)evaluation date for the clinical interpretation.
See https://github.com/opentargets/platform/issues/1161#issuecomment-683938510 for details."""
if self.xsd_version < 2:
# The DateLastEvaluated attribute is not always present. In this case, this property will be None.
return self.class_xml.attrib.get('DateLastEvaluated')
return find_optional_unique_element(self.class_xml, './DateLastEvaluated')

@property
def review_status(self):
"""Return a review status text for the assigned clinical significance. See score_map above for the list of
possible values."""
review_status = find_mandatory_unique_element(self.class_xml, './ReviewStatus').text
assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.accession}'
return review_status

@property
def score(self):
"""Return a score (star rating) for the assigned clinical significance. See score_map above."""
return self.score_map[self.review_status]

@property
def clinical_significance_raw(self):
"""The original clinical significance string as stored in ClinVar. Example: 'Benign/Likely benign'."""
try:
return find_mandatory_unique_element(self.class_xml, './Description').text
except AssertionError as e:
raise MultipleClinicalClassificationsError(f'Found multiple descriptions for one ClinicalClassification in '
f'{self.clinvar_record.accession}')

@property
def clinical_significance_list(self):
"""The normalised deduplicated list of all clinical significance values. The original value is (1) split into
multiple values by 3 delimiters: ('/', ', ', '; '), (2) converted into lowercase and (3) sorted
lexicographically. Example: 'Benign/Likely benign, risk_factor' → ['benign', 'likely benign', 'risk factor'].
See /data-exploration/clinvar-variant-types/README.md for further explanation."""
return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))

@property
def valid_clinical_significances(self):
return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNIFICANCES]
14 changes: 13 additions & 1 deletion cmat/clinvar_xml_io/clinvar_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import gzip
import logging
import re
from datetime import date

from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
Expand All @@ -15,10 +16,21 @@ def __init__(self, clinvar_xml):
self.clinvar_xml = clinvar_xml
self.header_attr = parse_header_attributes(clinvar_xml)
self.header_attr['LastProcessed'] = date.today().strftime('%Y-%m-%d')
self.xsd_version = self.get_xsd_version()

def __iter__(self):
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
yield ClinVarRecord(rcv)
yield ClinVarRecord(rcv, self.xsd_version)

def get_xsd_version(self):
# For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md
if 'xsi:noNamespaceSchemaLocation' in self.header_attr:
url = self.header_attr['xsi:noNamespaceSchemaLocation']
m = re.search(r'(ClinVar_RCV|clinvar_public)_([0-9.]+)\.xsd', url, flags=re.IGNORECASE)
if m and m.group(2):
return float(m.group(2))
# If we cannot parse, assume v2
return 2.0

def write_header(self, output_file):
header = b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<ReleaseSet'
Expand Down
99 changes: 46 additions & 53 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import xml.etree.ElementTree as ElementTree
from xml.dom import minidom

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure
from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait
from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \
Expand All @@ -18,28 +19,13 @@ class ClinVarRecord:
* Issue https://github.com/EBIvariation/eva-opentargets/issues/127 for the most recent discussions on changing
support of different ClinVar record types."""

# A score for the review status of the assigned clinical significance ranges from 0 to 4 and corresponds to the
# number of "gold stars" displayed on ClinVar website. See details here:
# https://www.ncbi.nlm.nih.gov/clinvar/docs/details/#review_status
score_map = {
"no assertion provided": 0,
'no assertion criteria provided': 0,
'no classifications from unflagged records': 0,
'criteria provided, single submitter': 1,
'criteria provided, conflicting interpretations': 1,
'criteria provided, multiple submitters, no conflicts': 2,
'reviewed by expert panel': 3,
'practice guideline': 4,
}

# Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
# Some records have been flagged by ClinVar and should not be used.
INVALID_CLINICAL_SIGNFICANCES = {'no classifications from unflagged records'}

def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
def __init__(self, rcv, xsd_version, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
"""Initialise a ClinVar record object from an RCV XML record."""
self.rcv = rcv
self.xsd_version = xsd_version

# Add a list of traits
self.trait_set = []
Expand All @@ -57,6 +43,16 @@ def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMea
else:
self.measure = measure_class(variant_measure, self)

# List of clinical classifications (Germline, Somatic, or Oncogenecity
self.clinical_classifications = []
if self.xsd_version < 2:
# V1 only ever has a single clinical classification / clinical significance
self.clinical_classifications.append(
ClinicalClassification(find_mandatory_unique_element(self.rcv, './ClinicalSignificance'), self))
else:
for clin_class in find_elements(self.rcv, './Classifications/*'):
self.clinical_classifications.append(ClinicalClassification(clin_class, self))

def __str__(self):
return f'ClinVarRecord object with accession {self.accession}'

Expand All @@ -83,26 +79,6 @@ def created_date(self):
"""This tracks the date the record was first made public on ClinVar."""
return self.rcv.attrib['DateCreated']

@property
def last_evaluated_date(self):
"""This tracks the latest (re)evaluation date for the clinical interpretation.
See https://github.com/opentargets/platform/issues/1161#issuecomment-683938510 for details."""
# The DateLastEvaluated attribute is not always present. In this case, this property will be None.
return find_mandatory_unique_element(self.rcv, './ClinicalSignificance').attrib.get('DateLastEvaluated')

@property
def review_status(self):
"""Return a review status text for the assigned clinical significance. See score_map above for the list of
possible values."""
review_status = find_mandatory_unique_element(self.rcv, './ClinicalSignificance/ReviewStatus').text
assert review_status in self.score_map, f'Unknown review status {review_status} in RCV {self.accession}'
return review_status

@property
def score(self):
"""Return a score (star rating) for the assigned clinical significance. See score_map above."""
return self.score_map[self.review_status]

@property
def mode_of_inheritance(self):
"""Return a (possibly empty) list of modes of inheritance for a given ClinVar record."""
Expand Down Expand Up @@ -133,27 +109,44 @@ def evidence_support_pubmed_refs(self):
for elem in find_elements(self.rcv, './ObservedIn/ObservedData/Citation/ID[@Source="PubMed"]')]

@property
def clinical_significance_raw(self):
"""The original clinical significance string as stored in ClinVar. Example: 'Benign/Likely benign'."""
return find_mandatory_unique_element(self.rcv, './ClinicalSignificance/Description').text
def allele_origins(self):
return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}

@property
def clinical_significance_list(self):
"""The normalised deduplicated list of all clinical significance values. The original value is (1) split into
multiple values by 3 delimiters: ('/', ', ', '; '), (2) converted into lowercase and (3) sorted
lexicographically. Example: 'Benign/Likely benign, risk_factor' → ['benign', 'likely benign', 'risk factor'].
See /data-exploration/clinvar-variant-types/README.md for further explanation."""
return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))
def valid_allele_origins(self):
"""Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}

# The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
# if there is exactly one ClinicalClassification for the record.
# Otherwise these should be taken from the ClinicalClassification objects directly.

@property
def valid_clinical_significances(self):
return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNFICANCES]
def last_evaluated_date(self):
if len(self.clinical_classifications) > 1:
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].last_evaluated_date

@property
def allele_origins(self):
return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
def review_status(self):
if len(self.clinical_classifications) > 1:
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].review_status

@property
def valid_allele_origins(self):
"""Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}
def score(self):
if len(self.clinical_classifications) > 1:
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].score

@property
def clinical_significance_list(self):
if len(self.clinical_classifications) > 1:
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].clinical_significance_list

@property
def valid_clinical_significances(self):
if len(self.clinical_classifications) > 1:
raise MultipleClinicalClassificationsError(f'Found multiple ClinicalClassifications for {self.accession}')
return self.clinical_classifications[0].valid_clinical_significances
2 changes: 1 addition & 1 deletion cmat/clinvar_xml_io/xml_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def parse_header_attributes(clinvar_xml):
"""Parses out attributes to the root-level ReleaseSet element and returns them as a dict."""
attrib = None
with gzip.open(clinvar_xml, 'rt') as fh:
for event, elem in ElementTree.iterparse(fh):
for event, elem in ElementTree.iterparse(fh, events=['start']):
if elem.tag == 'ReleaseSet':
attrib = elem.attrib
break
Expand Down
7 changes: 4 additions & 3 deletions cmat/output_generation/annotated_clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __iter__(self):
self.mismatches_file.write('RCV\tCV\tCMAT\n')

for rcv in iterate_rcv_from_xml(self.clinvar_xml):
record = AnnotatedClinVarRecord(rcv)
record = AnnotatedClinVarRecord(rcv, self.xsd_version)
self.annotate(record)
yield record

Expand Down Expand Up @@ -213,8 +213,9 @@ def print_counter(counter):

class AnnotatedClinVarRecord(ClinVarRecord):

def __init__(self, rcv):
super().__init__(rcv, trait_class=OntologyMappedClinVarTrait, measure_class=EnsemblAnnotatedClinVarMeasure)
def __init__(self, rcv, xsd_version):
super().__init__(rcv, xsd_version, trait_class=OntologyMappedClinVarTrait,
measure_class=EnsemblAnnotatedClinVarMeasure)


class OntologyMappedClinVarTrait(ClinVarTrait):
Expand Down
15 changes: 15 additions & 0 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import jsonschema

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

Expand Down Expand Up @@ -77,6 +78,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

# Catch any exceptions for this record so we can continue processing.
try:
# Failure mode 0 (skip). Contains multiple clinical classification annotations.
# This is new as of V2 of the ClinVar XSD and should definitely be supported at some point,
# but as it can cause parsing complications we catch these cases first.
# See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396
if len(clinvar_record.clinical_classifications) > 1:
logger.warning(f'Found multiple clinical classifications in record {clinvar_record.accession}')
report.clinvar_skip_multiple_clinical_classifications += 1
continue

# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# potentially mappable name).
if not clinvar_record.traits_with_valid_names:
Expand Down Expand Up @@ -153,6 +163,11 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
report.complete_evidence_string_count += complete_evidence_strings_generated
report.evidence_string_count += evidence_strings_generated

except MultipleClinicalClassificationsError as mcce:
# Ensure we catch any of these that fall through (e.g. from multiple description text)
logger.error(str(mcce))
report.clinvar_skip_multiple_clinical_classifications += 1

except Exception as e:
# We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
# records and printing the report.
Expand Down
Loading

0 comments on commit 21e78db

Please sign in to comment.