Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Issue 434: Add VCV IDs to evidence strings #444

Merged
merged 6 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion OT_SCHEMA_VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.8.0
2.8.1
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,9 @@ for record in ClinVarDataset('/path/to/clinvar.xml.gz'):
# e.g. RCV001842692: 3_38633214_G_C => Cardiac arrhythmia
print(s)
```

## Citation

If you find CMAT useful, you can cite the following:

> Shen et al., CMAT: ClinVar Mapping and Annotation Toolkit. _Bioinformatics Advances_, 2024. [doi:10.1093/bioadv/vbae018](https://doi.org/10.1093/bioadv/vbae018)
2 changes: 1 addition & 1 deletion bin/cmat/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.3.0
3.3.1
3 changes: 2 additions & 1 deletion cmat/clinvar_xml_io/clinvar_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ class ClinVarRecordMeasure:
MS_REPEAT_EXPANSION = 'repeat_expansion'
MS_NO_COMPLETE_COORDS = 'no_complete_coords'

def __init__(self, measure_xml, clinvar_record):
def __init__(self, measure_xml, clinvar_record, vcv_id):
self.measure_xml = measure_xml
self.clinvar_record = clinvar_record
self.vcv_id = vcv_id

@property
def all_names(self):
Expand Down
7 changes: 6 additions & 1 deletion cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_cl
if not variant_measure:
self.measure = None
else:
self.measure = measure_class(variant_measure, self)
self.measure = measure_class(variant_measure, self, self.vcv_id)

def __str__(self):
return f'ClinVarRecord object with accession {self.accession}'
Expand All @@ -60,6 +60,11 @@ def write(self, output):
def accession(self):
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']

@property
def vcv_id(self):
"""ClinVar's accession (VCV ID) for a MeasureSet, present only in RCV records; None if the 'Acc' attribute is absent."""
return find_mandatory_unique_element(self.record_xml, './MeasureSet').attrib.get('Acc', None)

@property
def last_updated_date(self):
"""This tracks the latest update date, counting even minor technical updates.
Expand Down
1 change: 1 addition & 0 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
'variantFunctionalConsequenceId': consequence_attributes.so_term.accession,
'variantId': clinvar_record.measure.vcf_full_coords, # CHROM_POS_REF_ALT notation.
'variantRsId': clinvar_record.measure.rs_id,
'variantFromSourceId': clinvar_record.vcv_id,

# PHENOTYPE ATTRIBUTES.
# The alphabetical list of *all* valid disease names from all traits from that ClinVar record, reported as a
Expand Down
4 changes: 2 additions & 2 deletions data-exploration/clinvar-variant-types/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ The data was last updated on **2022-10-24.** Graphs can be enlarged by clicking
## Updating the data

```bash
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz
python3 clinvar-variant-types.py --clinvar-xml ClinVarFullRelease_00-latest.xml.gz
wget -O clinvar.xml.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_release/ClinVarRCVRelease_00-latest.xml.gz
python3 clinvar-variant-types.py --clinvar-xml clinvar.xml.gz
```

The source code for diagrams and tables will be printed to STDOUT. The diagrams can then be built using the website http://sankeymatic.com/build/. Parameters for rendering them will be indicated in the output as well. The tables should be copy-pasted into the [corresponding Markdown file](supplementary-tables.md).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections import Counter
import logging

from eva_cttv_pipeline.clinvar_xml_io import clinvar_xml_io
from cmat import clinvar_xml_io

logging.basicConfig()
logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def run_pipeline(resource_name, include_transcripts=False):

def test_successful_run():
assert sorted(run_pipeline('precise_genomic.xml.gz')) == sorted([
['NC_000016.10:g.72059151_72063259del', 'ENSG00000140830', 'TXNL4B', 'intron_variant'],
['NC_000016.10:g.72059151_72063259del', 'ENSG00000257017', 'HP', 'stop_lost'],
['NC_000016.10:g.72059151_72063259del', 'ENSG00000261701', 'HPR', 'feature_truncation'],
['NC_000001.11:g.25271785_25329047del', 'ENSG00000117616', 'RSRP1', 'intron_variant'],
Expand All @@ -37,7 +36,7 @@ def test_successful_run():

def test_successful_run_with_transcripts():
results = run_pipeline('precise_genomic.xml.gz', include_transcripts=True)
assert len(results) == 28
assert len(results) == 27
assert ['NC_000001.11:g.25271785_25329047del', 'ENSG00000187010', 'RHD', 'stop_lost', 'ENST00000454452'] in results


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"releaseDate": "2013-04-04",
"studyId": "RCV000002127",
"targetFromSourceId": "ENSG00000139988",
"variantFromSourceId": "VCV000002046",
"variantFunctionalConsequenceId": "SO_0001583",
"variantHgvsId": "NC_000014.9:g.67729209A>G",
"variantId": "14_67729209_A_G",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"releaseDate": "2017-01-13",
"studyId": "RCV000415158",
"targetFromSourceId": "ENSG00000139988",
"variantFromSourceId": "VCV000374196",
"variantFunctionalConsequenceId": "SO_0001583",
"variantHgvsId": "NC_000007.14:g.94423102G>A",
"variantId": "7_94423102_G_A",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"releaseDate": "2013-04-04",
"studyId": "RCV000002127",
"targetFromSourceId": "ENSG00000139988",
"variantFromSourceId": "VCV000002046",
"variantFunctionalConsequenceId": "SO_0001583",
"variantHgvsId": "NC_000014.9:g.67729209A>G",
"variantId": "14_67729209_A_G",
Expand Down
1 change: 1 addition & 0 deletions tests/pipelines/resources/expected/consequences_snp.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@
17:63957460:A:G ENSG00000007314 SCN4A missense_variant
17:63971780:C:T ENSG00000007314 SCN4A missense_variant
17:64485735:TG:T ENSG00000256525 POLG2 frameshift_variant
17:64485735:TG:T ENSG00000271605 MILR1 intron_variant
17:6453135:C:T ENSG00000091622 PITPNM3 3_prime_UTR_variant
17:65177778:A:G ENSG00000108370 RGS9 missense_variant
17:65538242:G:C ENSG00000168646 AXIN2 missense_variant
Expand Down
Loading
Loading