Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 416 - Parallelise evidence string generation #421

Merged
merged 5 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions bin/aggregate_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

import argparse
import glob

from cmat.output_generation.report import Report

parser = argparse.ArgumentParser('Aggregate counts reports')
parser.add_argument('--counts-yml', nargs='+', help='YAML files containing intermediate counts', required=True)


if __name__ == '__main__':
args = parser.parse_args()

# Load all the reports
filenames = [f for files in args.counts_yml for f in glob.glob(files)]
reports = []
for filename in filenames:
r = Report()
r.load_from_file(filename)
reports.append(r)

# Sum them up and output the results
complete_report = sum(reports, start=Report())
complete_report.print_report()
complete_report.dump_to_file(dir_out='.')
complete_report.write_unmapped_terms(dir_out='.')
if not complete_report.check_counts():
raise RuntimeError('Aggregate counts not consistent')
13 changes: 13 additions & 0 deletions bin/count_clinvar_rcvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

import argparse

from cmat.clinvar_xml_io import ClinVarDataset

parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout')
parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True)


if __name__ == '__main__':
args = parser.parse_args()
print(sum(1 for _ in ClinVarDataset(args.clinvar_xml)))
4 changes: 3 additions & 1 deletion bin/evidence_string_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True)
parser.add_argument('--out', help='Output directory', required=True)
parser.add_argument('--start', help='Start index (inclusive)', required=False, type=int)
parser.add_argument('--end', help='End index (exclusive)', required=False, type=int)


if __name__ == '__main__':
args = parser.parse_args()
clinvar_to_evidence_strings.launch_pipeline(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
ot_schema_file=args.ot_schema, dir_out=args.out)
ot_schema_file=args.ot_schema, dir_out=args.out, start=args.start, end=args.end)
108 changes: 17 additions & 91 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import re
import sys
import os
from collections import defaultdict, Counter
from collections import defaultdict

import jsonschema

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

logger = logging.getLogger(__package__)

Expand All @@ -22,90 +23,6 @@
# Output file names.
EVIDENCE_STRINGS_FILE_NAME = 'evidence_strings.json'
EVIDENCE_RECORDS_FILE_NAME = 'evidence_records.tsv'
UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'


class Report:
"""Holds counters and other records for a pipeline run."""

def __init__(self, trait_mappings, consequence_mappings):
self.report_strings = []

# The main evidence string counter.
self.evidence_string_count = 0
# Complete evidence strings are ones with an EFO mapping.
self.complete_evidence_string_count = 0

# ClinVar record counters.
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
self.clinvar_skip_invalid_evidence_string = 0
self.clinvar_done_one_complete_evidence_string = 0
self.clinvar_done_multiple_complete_evidence_strings = 0

# Total number of trait-to-ontology mappings present in the database.
self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
# All distinct (trait name, EFO ID) mappings used in the evidence strings.
self.used_trait_mappings = set()
# All unmapped trait names which prevented evidence string generation and their counts.
self.unmapped_trait_names = Counter()

# Variant-to-consequence mapping counts.
self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
self.repeat_expansion_variants = 0
self.structural_variants = 0

def print_report_and_check_counts(self):
"""Print report of counts and return True if counts are consistent, False otherwise."""
# ClinVar tallies.
clinvar_fatal = self.clinvar_fatal_no_valid_traits
clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
clinvar_done = (self.clinvar_done_one_complete_evidence_string +
self.clinvar_done_multiple_complete_evidence_strings)

report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
Total number of complete evidence strings generated\t{self.complete_evidence_string_count}

Total number of ClinVar records\t{self.clinvar_total}
Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
Skipped: Can be rescued by future improvements\t{clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
Done: Generated at least one complete evidence string\t{clinvar_done}
One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
clinvar_done / (clinvar_skipped + clinvar_done):.1%}

Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
The number of distinct trait-to-ontology mappings used in the evidence strings\t{
len(self.used_trait_mappings)}
The number of distinct unmapped trait names which prevented complete evidence string generation\t{
len(self.unmapped_trait_names)}

Total number of variant to consequence mappings\t{self.total_consequence_mappings}
Number of repeat expansion variants\t{self.repeat_expansion_variants}
Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
print(report)

# Confirm counts as expected, exit with error if not.
expected_total = clinvar_fatal + clinvar_skipped + clinvar_done
if expected_total != self.clinvar_total:
logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
return False
return True

def write_unmapped_terms(self, dir_out):
with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')


def validate_evidence_string(ev_string, ot_schema_contents):
Expand All @@ -122,29 +39,38 @@ def validate_evidence_string(ev_string, ot_schema_contents):
sys.exit(1)


def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out):
def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, start, end):
os.makedirs(dir_out, exist_ok=True)
string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)

report, exception_raised = clinvar_to_evidence_strings(
string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
counts_consistent = report.print_report_and_check_counts()
report.write_unmapped_terms(dir_out)
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end)
counts_consistent = report.check_counts()
report.print_report()
report.dump_to_file(dir_out)
if exception_raised or not counts_consistent:
sys.exit(1)


def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
output_evidence_strings):
output_evidence_strings, start=None, end=None):
report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
ot_schema_contents = json.loads(open(ot_schema).read())
output_evidence_strings_file = open(output_evidence_strings, 'wt')
exception_raised = False

logger.info('Processing ClinVar records')
i = -1
for clinvar_record in ClinVarDataset(clinvar_xml):
# If start & end provided, only process records in the range [start, end)
i += 1
if start and i < start:
continue
if end and i >= end:
break

report.clinvar_total += 1
if report.clinvar_total % 1000 == 0:
logger.info(f'{report.clinvar_total} records processed')
Expand Down Expand Up @@ -208,7 +134,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
evidence_strings_generated += 1
if disease_mapped_efo_id is not None:
complete_evidence_strings_generated += 1
report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))
report.used_trait_mappings.add(disease_mapped_efo_id)

if complete_evidence_strings_generated == 1:
report.clinvar_done_one_complete_evidence_string += 1
Expand Down
138 changes: 138 additions & 0 deletions cmat/output_generation/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import logging
import os
from collections import Counter

import yaml

logger = logging.getLogger(__package__)

# Add representers to enable safe-dump
yaml.SafeDumper.add_representer(Counter,
lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items()))

UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'
COUNTS_FILE_NAME = 'counts.yml'


class Report:
"""Holds counters and other records for a pipeline run."""

def __init__(self, trait_mappings=None, consequence_mappings=None):
# The main evidence string counter.
self.evidence_string_count = 0
# Complete evidence strings are ones with an EFO mapping.
self.complete_evidence_string_count = 0

# ClinVar record counters.
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
self.clinvar_skip_invalid_evidence_string = 0
self.clinvar_done_one_complete_evidence_string = 0
self.clinvar_done_multiple_complete_evidence_strings = 0
self.clinvar_fatal = 0
self.clinvar_skipped = 0
self.clinvar_done = 0

# Total number of trait-to-ontology mappings present in the database.
self.total_trait_mappings = 0
if trait_mappings:
self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
# All distinct (trait name, EFO ID) mappings used in the evidence strings.
self.used_trait_mappings = set()
# All unmapped trait names which prevented evidence string generation and their counts.
self.unmapped_trait_names = Counter()

# Variant-to-consequence mapping counts.
self.total_consequence_mappings = 0
if consequence_mappings:
self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
self.repeat_expansion_variants = 0
self.structural_variants = 0

def __eq__(self, other):
if not isinstance(other, Report):
return NotImplemented
return self.__dict__ == other.__dict__

def __add__(self, other):
if not isinstance(other, Report):
return NotImplemented
result = Report()
for var_name in vars(self).keys() | vars(other).keys():
if var_name == 'total_trait_mappings':
result.total_trait_mappings = max(self.total_trait_mappings, other.total_trait_mappings)
if var_name == 'total_consequence_mappings':
result.total_consequence_mappings = max(self.total_consequence_mappings,
other.total_consequence_mappings)
if var_name == 'used_trait_mappings':
result.used_trait_mappings = self.used_trait_mappings | other.used_trait_mappings
else:
result.__setattr__(var_name, self.__getattribute__(var_name) + other.__getattribute__(var_name))
return result

def dump_to_file(self, dir_out, filename=COUNTS_FILE_NAME):
with open(os.path.join(dir_out, filename), 'w') as f:
yaml.safe_dump(vars(self), f)

def load_from_file(self, filename):
with open(filename, 'r') as f:
data = yaml.safe_load(f)
self.__dict__.update(**data)
# yaml loads a dict, convert to counter
self.unmapped_trait_names = Counter(self.unmapped_trait_names)
Comment on lines +77 to +86
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to say that pickling the objet might be safer but I 'm guessing that you want to use the final output yaml elsewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was thinking having it in yaml might be more readable and flexible for downstream purposes (e.g. updating the metrics or generating sankey diagrams), but it's probably overkill if those processes end up implemented in Python anyway... I'd probably keep it as it is for now.


def compute_record_tallies(self):
"""Compute tallies of records fatal/skipped/done based on the more granular counts."""
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits
self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
self.clinvar_done = (self.clinvar_done_one_complete_evidence_string +
self.clinvar_done_multiple_complete_evidence_strings)

def check_counts(self):
"""Return True if counts are consistent, False otherwise."""
self.compute_record_tallies()
expected_total = self.clinvar_fatal + self.clinvar_skipped + self.clinvar_done
if expected_total != self.clinvar_total:
logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
return False
return True

def print_report(self):
"""Print report of counts."""
self.compute_record_tallies()
report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
Total number of complete evidence strings generated\t{self.complete_evidence_string_count}

Total number of ClinVar records\t{self.clinvar_total}
Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
Done: Generated at least one complete evidence string\t{self.clinvar_done}
One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
self.clinvar_done / (self.clinvar_skipped + self.clinvar_done):.1%}

Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
The number of distinct trait-to-ontology mappings used in the evidence strings\t{
len(self.used_trait_mappings)}
The number of distinct unmapped trait names which prevented complete evidence string generation\t{
len(self.unmapped_trait_names)}

Total number of variant to consequence mappings\t{self.total_consequence_mappings}
Number of repeat expansion variants\t{self.repeat_expansion_variants}
Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
print(report)

def write_unmapped_terms(self, dir_out):
with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')
Loading
Loading