Skip to content

Commit

Permalink
Merge pull request #444 from apriltuesday/add-vcv-id
Browse files Browse the repository at this point in the history
Issue 434: Add VCV IDs to evidence strings
  • Loading branch information
apriltuesday authored Oct 17, 2024
2 parents bd29fe7 + f361ec1 commit b4fdf36
Show file tree
Hide file tree
Showing 14 changed files with 1,110 additions and 1,094 deletions.
2 changes: 1 addition & 1 deletion OT_SCHEMA_VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.8.0
2.8.1
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,9 @@ for record in ClinVarDataset('/path/to/clinvar.xml.gz'):
# e.g. RCV001842692: 3_38633214_G_C => Cardiac arrhythmia
print(s)
```

## Citation

If you find CMAT useful, you can cite the following:

> Shen et al., CMAT: ClinVar Mapping and Annotation Toolkit. _Bioinformatics Advances_, 2024. [doi:10.1093/bioadv/vbae018](https://doi.org/10.1093/bioadv/vbae018)
2 changes: 1 addition & 1 deletion bin/cmat/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.3.0
3.3.1
3 changes: 2 additions & 1 deletion cmat/clinvar_xml_io/clinvar_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ class ClinVarRecordMeasure:
MS_REPEAT_EXPANSION = 'repeat_expansion'
MS_NO_COMPLETE_COORDS = 'no_complete_coords'

def __init__(self, measure_xml, clinvar_record):
def __init__(self, measure_xml, clinvar_record, vcv_id):
self.measure_xml = measure_xml
self.clinvar_record = clinvar_record
self.vcv_id = vcv_id

@property
def all_names(self):
Expand Down
7 changes: 6 additions & 1 deletion cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_cl
if not variant_measure:
self.measure = None
else:
self.measure = measure_class(variant_measure, self)
self.measure = measure_class(variant_measure, self, self.vcv_id)

def __str__(self):
return f'ClinVarRecord object with accession {self.accession}'
Expand All @@ -60,6 +60,11 @@ def write(self, output):
def accession(self):
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']

@property
def vcv_id(self):
"""ClinVar's accession for a MeasureSet, present only in RCV records."""
return find_mandatory_unique_element(self.record_xml, './MeasureSet').attrib.get('Acc', None)

@property
def last_updated_date(self):
"""This tracks the latest update date, counting even minor technical updates.
Expand Down
1 change: 1 addition & 0 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
'variantFunctionalConsequenceId': consequence_attributes.so_term.accession,
'variantId': clinvar_record.measure.vcf_full_coords, # CHROM_POS_REF_ALT notation.
'variantRsId': clinvar_record.measure.rs_id,
'variantFromSourceId': clinvar_record.vcv_id,

# PHENOTYPE ATTRIBUTES.
# The alphabetical list of *all* valid disease names from all traits from that ClinVar record, reported as a
Expand Down
4 changes: 2 additions & 2 deletions data-exploration/clinvar-variant-types/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ The data was last updated on **2022-10-24.** Graphs can be enlarged by clicking
## Updating the data

```bash
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz
python3 clinvar-variant-types.py --clinvar-xml ClinVarFullRelease_00-latest.xml.gz
wget -O clinvar.xml.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
python3 clinvar-variant-types.py --clinvar-xml clinvar.xml.gz
```

The source code for diagrams and tables will be printed to STDOUT. The diagrams can then be built using the website http://sankeymatic.com/build/. Parameters for rendering them will be indicated in the output as well. The tables should be copy-pasted into the [corresponding Markdown file](supplementary-tables.md).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections import Counter
import logging

from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
from cmat import clinvar_xml_io

logging.basicConfig()
logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def run_pipeline(resource_name, include_transcripts=False):

def test_successful_run():
assert sorted(run_pipeline('precise_genomic.xml.gz')) == sorted([
['NC_000016.10:g.72059151_72063259del', 'ENSG00000140830', 'TXNL4B', 'intron_variant'],
['NC_000016.10:g.72059151_72063259del', 'ENSG00000257017', 'HP', 'stop_lost'],
['NC_000016.10:g.72059151_72063259del', 'ENSG00000261701', 'HPR', 'feature_truncation'],
['NC_000001.11:g.25271785_25329047del', 'ENSG00000117616', 'RSRP1', 'intron_variant'],
Expand All @@ -37,7 +36,7 @@ def test_successful_run():

def test_successful_run_with_transcripts():
results = run_pipeline('precise_genomic.xml.gz', include_transcripts=True)
assert len(results) == 28
assert len(results) == 27
assert ['NC_000001.11:g.25271785_25329047del', 'ENSG00000187010', 'RHD', 'stop_lost', 'ENST00000454452'] in results


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"releaseDate": "2013-04-04",
"studyId": "RCV000002127",
"targetFromSourceId": "ENSG00000139988",
"variantFromSourceId": "VCV000002046",
"variantFunctionalConsequenceId": "SO_0001583",
"variantHgvsId": "NC_000014.9:g.67729209A>G",
"variantId": "14_67729209_A_G",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"releaseDate": "2017-01-13",
"studyId": "RCV000415158",
"targetFromSourceId": "ENSG00000139988",
"variantFromSourceId": "VCV000374196",
"variantFunctionalConsequenceId": "SO_0001583",
"variantHgvsId": "NC_000007.14:g.94423102G>A",
"variantId": "7_94423102_G_A",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"releaseDate": "2013-04-04",
"studyId": "RCV000002127",
"targetFromSourceId": "ENSG00000139988",
"variantFromSourceId": "VCV000002046",
"variantFunctionalConsequenceId": "SO_0001583",
"variantHgvsId": "NC_000014.9:g.67729209A>G",
"variantId": "14_67729209_A_G",
Expand Down
1 change: 1 addition & 0 deletions tests/pipelines/resources/expected/consequences_snp.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@
17:63957460:A:G ENSG00000007314 SCN4A missense_variant
17:63971780:C:T ENSG00000007314 SCN4A missense_variant
17:64485735:TG:T ENSG00000256525 POLG2 frameshift_variant
17:64485735:TG:T ENSG00000271605 MILR1 intron_variant
17:6453135:C:T ENSG00000091622 PITPNM3 3_prime_UTR_variant
17:65177778:A:G ENSG00000108370 RGS9 missense_variant
17:65538242:G:C ENSG00000168646 AXIN2 missense_variant
Expand Down
Loading

0 comments on commit b4fdf36

Please sign in to comment.