Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 416 - Parallelise evidence string generation #421

Merged
merged 5 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions bin/aggregate_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

import argparse
import glob

from cmat.output_generation.report import Report

parser = argparse.ArgumentParser('Aggregate counts reports')
parser.add_argument('--counts-yml', nargs='+', help='YAML files containing intermediate counts', required=True)


if __name__ == '__main__':
args = parser.parse_args()

# Load all the reports
filenames = [f for files in args.counts_yml for f in glob.glob(files)]
reports = []
for filename in filenames:
r = Report()
r.load_from_file(filename)
reports.append(r)

# Sum them up and output the results
complete_report = sum(reports, start=Report())
complete_report.print_report()
complete_report.dump_to_file(dir_out='.')
complete_report.write_unmapped_terms(dir_out='.')
if not complete_report.check_counts():
raise RuntimeError('Aggregate counts not consistent')
13 changes: 13 additions & 0 deletions bin/count_clinvar_rcvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

import argparse

from cmat.clinvar_xml_io import ClinVarDataset

parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout')
parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True)


if __name__ == '__main__':
args = parser.parse_args()
print(sum(1 for _ in ClinVarDataset(args.clinvar_xml)))
4 changes: 3 additions & 1 deletion bin/evidence_string_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True)
parser.add_argument('--out', help='Output directory', required=True)
parser.add_argument('--start', help='Start index (inclusive)', required=False, type=int)
parser.add_argument('--end', help='End index (exclusive)', required=False, type=int)


if __name__ == '__main__':
args = parser.parse_args()
clinvar_to_evidence_strings.launch_pipeline(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
ot_schema_file=args.ot_schema, dir_out=args.out)
ot_schema_file=args.ot_schema, dir_out=args.out, start=args.start, end=args.end)
6 changes: 6 additions & 0 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class ClinVarRecord:

# Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
# Some records have been flagged by ClinVar and should not be used.
INVALID_CLINICAL_SIGNFICANCES = {'no classifications from unflagged records'}

def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
"""Initialise a ClinVar record object from an RCV XML record."""
Expand Down Expand Up @@ -143,6 +145,10 @@ def clinical_significance_list(self):
See /data-exploration/clinvar-variant-types/README.md for further explanation."""
return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))

@property
def valid_clinical_significances(self):
return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNFICANCES]

@property
def allele_origins(self):
return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
Expand Down
134 changes: 33 additions & 101 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import re
import sys
import os
from collections import defaultdict, Counter
from collections import defaultdict

import jsonschema

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

logger = logging.getLogger(__package__)

Expand All @@ -22,90 +23,6 @@
# Output file names.
EVIDENCE_STRINGS_FILE_NAME = 'evidence_strings.json'
EVIDENCE_RECORDS_FILE_NAME = 'evidence_records.tsv'
UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'


class Report:
"""Holds counters and other records for a pipeline run."""

def __init__(self, trait_mappings, consequence_mappings):
self.report_strings = []

# The main evidence string counter.
self.evidence_string_count = 0
# Complete evidence strings are ones with an EFO mapping.
self.complete_evidence_string_count = 0

# ClinVar record counters.
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
self.clinvar_skip_invalid_evidence_string = 0
self.clinvar_done_one_complete_evidence_string = 0
self.clinvar_done_multiple_complete_evidence_strings = 0

# Total number of trait-to-ontology mappings present in the database.
self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
# All distinct (trait name, EFO ID) mappings used in the evidence strings.
self.used_trait_mappings = set()
# All unmapped trait names which prevented evidence string generation and their counts.
self.unmapped_trait_names = Counter()

# Variant-to-consequence mapping counts.
self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
self.repeat_expansion_variants = 0
self.structural_variants = 0

def print_report_and_check_counts(self):
"""Print report of counts and return True if counts are consistent, False otherwise."""
# ClinVar tallies.
clinvar_fatal = self.clinvar_fatal_no_valid_traits
clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
clinvar_done = (self.clinvar_done_one_complete_evidence_string +
self.clinvar_done_multiple_complete_evidence_strings)

report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
Total number of complete evidence strings generated\t{self.complete_evidence_string_count}

Total number of ClinVar records\t{self.clinvar_total}
Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
Skipped: Can be rescued by future improvements\t{clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
Done: Generated at least one complete evidence string\t{clinvar_done}
One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
clinvar_done / (clinvar_skipped + clinvar_done):.1%}

Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
The number of distinct trait-to-ontology mappings used in the evidence strings\t{
len(self.used_trait_mappings)}
The number of distinct unmapped trait names which prevented complete evidence string generation\t{
len(self.unmapped_trait_names)}

Total number of variant to consequence mappings\t{self.total_consequence_mappings}
Number of repeat expansion variants\t{self.repeat_expansion_variants}
Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
print(report)

# Confirm counts as expected, exit with error if not.
expected_total = clinvar_fatal + clinvar_skipped + clinvar_done
if expected_total != self.clinvar_total:
logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
return False
return True

def write_unmapped_terms(self, dir_out):
with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')


def validate_evidence_string(ev_string, ot_schema_contents):
Expand All @@ -122,29 +39,38 @@ def validate_evidence_string(ev_string, ot_schema_contents):
sys.exit(1)


def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out):
def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, start, end):
os.makedirs(dir_out, exist_ok=True)
string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)

report, exception_raised = clinvar_to_evidence_strings(
string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
counts_consistent = report.print_report_and_check_counts()
report.write_unmapped_terms(dir_out)
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end)
counts_consistent = report.check_counts()
report.print_report()
report.dump_to_file(dir_out)
if exception_raised or not counts_consistent:
sys.exit(1)


def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
output_evidence_strings):
output_evidence_strings, start=None, end=None):
report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
ot_schema_contents = json.loads(open(ot_schema).read())
output_evidence_strings_file = open(output_evidence_strings, 'wt')
exception_raised = False

logger.info('Processing ClinVar records')
i = -1
for clinvar_record in ClinVarDataset(clinvar_xml):
# If start & end provided, only process records in the range [start, end)
i += 1
if start and i < start:
continue
if end and i >= end:
break

report.clinvar_total += 1
if report.clinvar_total % 1000 == 0:
logger.info(f'{report.clinvar_total} records processed')
Expand All @@ -156,20 +82,25 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue
# Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# submissions being flagged.
if not clinvar_record.valid_clinical_significances:
report.clinvar_fatal_no_clinical_significance += 1
continue

# Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
# Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue

# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid
# allele origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

# Failure mode 3 (skip). No functional consequences are available.
# Failure mode 4 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue
Expand All @@ -180,9 +111,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable
# EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a
# failure so we can continue to measure mapping coverage.
# Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# suitable EFO mappings were found in the database. This will still generate an evidence string, but is
# tracked as a failure so we can continue to measure mapping coverage.
if not any(group[-1] for group in grouped_diseases):
report.clinvar_skip_missing_efo_mapping += 1
unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name
Expand All @@ -196,8 +127,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
for allele_origins, disease_attributes, consequence_attributes in itertools.product(
grouped_allele_origins, grouped_diseases, consequence_types):
disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
disease_mapped_efo_id, consequence_attributes)
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name,
disease_source_id, disease_mapped_efo_id,
consequence_attributes)

# Validate and immediately output the evidence string (not keeping everything in memory).
is_valid = validate_evidence_string(evidence_string, ot_schema_contents)
Expand All @@ -208,7 +140,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
evidence_strings_generated += 1
if disease_mapped_efo_id is not None:
complete_evidence_strings_generated += 1
report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))
report.used_trait_mappings.add(disease_mapped_efo_id)

if complete_evidence_strings_generated == 1:
report.clinvar_done_one_complete_evidence_string += 1
Expand Down Expand Up @@ -259,7 +191,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
'allelicRequirements': clinvar_record.mode_of_inheritance,

# Levels of clinical significance reported for the variant.
'clinicalSignificances': clinvar_record.clinical_significance_list,
'clinicalSignificances': clinvar_record.valid_clinical_significances,

# Confidence (review status).
'confidence': clinvar_record.review_status,
Expand Down
Loading
Loading