From 32b8d54adf904d2d780c652477014b88f99b8aea Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 29 Feb 2024 15:32:11 +0000 Subject: [PATCH 1/5] chunk and parallelise evidence string generation --- bin/count_clinvar_rcvs.py | 13 ++++ bin/evidence_string_generation.py | 4 +- .../clinvar_to_evidence_strings.py | 15 +++- pipelines/annotation_pipeline.nf | 74 ++++++++++++++++--- .../test_clinvar_to_evidence_strings.py | 1 - tests/pipelines/test_annotation_pipeline.sh | 3 +- 6 files changed, 92 insertions(+), 18 deletions(-) create mode 100644 bin/count_clinvar_rcvs.py diff --git a/bin/count_clinvar_rcvs.py b/bin/count_clinvar_rcvs.py new file mode 100644 index 00000000..13b34ae5 --- /dev/null +++ b/bin/count_clinvar_rcvs.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +import argparse + +from cmat.clinvar_xml_io import ClinVarDataset + +parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout') +parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True) + + +if __name__ == '__main__': + args = parser.parse_args() + print(sum(1 for _ in ClinVarDataset(args.clinvar_xml))) diff --git a/bin/evidence_string_generation.py b/bin/evidence_string_generation.py index 6b6aabf4..28893942 100755 --- a/bin/evidence_string_generation.py +++ b/bin/evidence_string_generation.py @@ -9,10 +9,12 @@ parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True) parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True) parser.add_argument('--out', help='Output directory', required=True) +parser.add_argument('--start', help='Start index (inclusive)', required=False, type=int) +parser.add_argument('--end', help='End index (exclusive)', required=False, type=int) if __name__ == '__main__': args = parser.parse_args() clinvar_to_evidence_strings.launch_pipeline( clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping, - ot_schema_file=args.ot_schema, dir_out=args.out) + ot_schema_file=args.ot_schema, dir_out=args.out, start=args.start, end=args.end) diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 74ffd48f..06ea3e0c 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -60,6 +60,7 @@ def __init__(self, trait_mappings, consequence_mappings): def print_report_and_check_counts(self): """Print report of counts and return True if counts are consistent, False otherwise.""" + # TODO dump to a file so counts can be aggregated # ClinVar tallies. 
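The --start/--end options added above give each worker a half-open window [start, end) of RCV records. A minimal sketch of the same windowing pattern over an arbitrary iterable, using itertools.islice purely for illustration (the pipeline itself keeps an explicit record counter, as shown in the hunk below):

from itertools import islice

def window(records, start=None, end=None):
    # Yields only the records whose index falls in [start, end),
    # without materialising the whole dataset in memory.
    return islice(records, start, end)

# A plain list standing in for ClinVarDataset; values are made up.
records = [f'RCV{i:09d}' for i in range(10)]
print(list(window(records, 3, 6)))  # ['RCV000000003', 'RCV000000004', 'RCV000000005']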
clinvar_fatal = self.clinvar_fatal_no_valid_traits clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + @@ -122,14 +123,14 @@ def validate_evidence_string(ev_string, ot_schema_contents): sys.exit(1) -def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out): +def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, start, end): os.makedirs(dir_out, exist_ok=True) string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file) variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file) report, exception_raised = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, - output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME)) + output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end) counts_consistent = report.print_report_and_check_counts() report.write_unmapped_terms(dir_out) if exception_raised or not counts_consistent: @@ -137,14 +138,22 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema, - output_evidence_strings): + output_evidence_strings, start=None, end=None): report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings) ot_schema_contents = json.loads(open(ot_schema).read()) output_evidence_strings_file = open(output_evidence_strings, 'wt') exception_raised = False logger.info('Processing ClinVar records') + i = -1 for clinvar_record in ClinVarDataset(clinvar_xml): + # If start & end provided, only process records in the range [start, end) + i += 1 + if start and i < start: + continue + if end and i >= end: + break + report.clinvar_total += 1 if report.clinvar_total % 1000 == 0: logger.info(f'{report.clinvar_total} records processed') diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index c7b5411b..e65519d5 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -34,6 +34,8 @@ if (!params.output_dir) { batchRoot = params.output_dir codeRoot = "${projectDir}/.." includeTranscriptsFlag = params.include_transcripts ? "--include-transcripts" : "" +// Number of chunks to run in parallel for evidence generation +numChunks = 10 /* * Main workflow. @@ -56,11 +58,23 @@ workflow { if (params.schema != null) { // Open Targets evidence string output downloadJsonSchema() + // Get start/end indices to break XML into chunks + countClinvarRecords(clinvarXml) + .map { strN -> + n = Integer.parseInt(strN.trim()) + step = (Integer)Math.ceil(n / numChunks) + startIndices = (0..n-1).step(step) + endIndices = startIndices[1..-1] + [n] + [startIndices, endIndices].transpose() + } + .set { startEndPairs } + // Generate evidence for each chunk and concatenate generateEvidence(clinvarXml, downloadJsonSchema.out.jsonSchema, - combineConsequences.out.consequencesCombined) - - checkDuplicates(generateEvidence.out.evidenceStrings) + combineConsequences.out.consequencesCombined, + startEndPairs.collect()) + collectEvidenceStrings(generateEvidence.out.evidenceStrings.collect()) + checkDuplicates(collectEvidenceStrings.out.evidenceStrings) convertXrefs(clinvarXml) } else { @@ -308,25 +322,37 @@ process generateAnnotatedXml { """ } +/* + * Count number of RCV records in ClinVar. 
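The map above converts the RCV count into (start, end) index pairs: the chunk size is ceil(n / numChunks), start indices run 0, step, 2*step, ..., and each end index is the next start (or n for the final chunk). A Python sketch of the same partitioning, with chunk_ranges as an illustrative name:

import math

def chunk_ranges(n, num_chunks):
    # Same arithmetic as the Nextflow closure: ceil-sized windows,
    # each end index equal to the next start index (or n at the tail).
    step = math.ceil(n / num_chunks)
    starts = list(range(0, n, step))
    ends = starts[1:] + [n]
    return list(zip(starts, ends))

print(chunk_ranges(25, 10))
# [(0, 3), (3, 6), (6, 9), (9, 12), (12, 15), (15, 18), (18, 21), (21, 24), (24, 25)]

Note that when the ceil-sized step does not divide n evenly, fewer than numChunks ranges may be produced, but every record index is still covered exactly once.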
+ */ +process countClinvarRecords { + input: + path clinvarXml + + output: + stdout emit: count + + script: + """ + \${PYTHON_BIN} ${codeRoot}/bin/count_clinvar_rcvs.py --clinvar-xml ${clinvarXml} + """ +} + /* * Generate the evidence strings for submission to Open Targets. */ process generateEvidence { - clusterOptions "-o ${batchRoot}/logs/evidence_string_generation.out \ - -e ${batchRoot}/logs/evidence_string_generation.err" - - publishDir "${batchRoot}/evidence_strings", - overwrite: true, - mode: "copy", - pattern: "*.json" + clusterOptions "-o ${batchRoot}/logs/evidence_string_generation_${startEnd[0]}.out \ + -e ${batchRoot}/logs/evidence_string_generation_${startEnd[0]}.err" input: path clinvarXml path jsonSchema path consequenceMappings + each startEnd output: - path "evidence_strings.json", emit: evidenceStrings + path "evidence_strings_*.json", emit: evidenceStrings script: """ @@ -335,7 +361,31 @@ process generateEvidence { --efo-mapping ${params.mappings} \ --gene-mapping ${consequenceMappings} \ --ot-schema ${jsonSchema} \ - --out . + --out . \ + --start ${startEnd[0]} \ + --end ${startEnd[1]} + mv evidence_strings.json evidence_strings_${startEnd[0]}.json + """ +} + +/* + * Concatenate evidence strings into a single file. + */ +process collectEvidenceStrings { + publishDir "${batchRoot}/evidence_strings", + overwrite: true, + mode: "copy", + pattern: "*.json" + + input: + path "evidence_strings_*.json" + + output: + path "evidence_strings.json", emit: evidenceStrings + + script: + """ + cat evidence_strings_*.json >> evidence_strings.json """ } diff --git a/tests/output_generation/test_clinvar_to_evidence_strings.py b/tests/output_generation/test_clinvar_to_evidence_strings.py index d99bf7e2..940f3738 100644 --- a/tests/output_generation/test_clinvar_to_evidence_strings.py +++ b/tests/output_generation/test_clinvar_to_evidence_strings.py @@ -1,6 +1,5 @@ import json import os -import re import requests import xml.etree.ElementTree as ElementTree diff --git a/tests/pipelines/test_annotation_pipeline.sh b/tests/pipelines/test_annotation_pipeline.sh index af93a7e7..3cded4df 100644 --- a/tests/pipelines/test_annotation_pipeline.sh +++ b/tests/pipelines/test_annotation_pipeline.sh @@ -22,7 +22,8 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ diff ${BATCH_ROOT}/gene_mapping/consequences_snp.tsv ${BATCH_ROOT_BASE}/expected/consequences_snp.tsv diff ${BATCH_ROOT}/gene_mapping/consequences_repeat.tsv ${BATCH_ROOT_BASE}/expected/consequences_repeat.tsv diff ${BATCH_ROOT}/gene_mapping/consequences_structural.tsv ${BATCH_ROOT_BASE}/expected/consequences_structural.tsv -diff ${BATCH_ROOT}/evidence_strings/evidence_strings.json ${BATCH_ROOT_BASE}/expected/evidence_strings.json +# Nextflow parallelism means ordering might not be stable +diff <(sort ${BATCH_ROOT}/evidence_strings/evidence_strings.json) <(sort ${BATCH_ROOT_BASE}/expected/evidence_strings.json) cd ${CWD} rm -r ${BATCH_ROOT} From 089ad47048b10682d23f4cd9d2caa10bed8219ff Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 4 Mar 2024 13:21:45 +0000 Subject: [PATCH 2/5] output and aggregate counts for evidence strings --- bin/aggregate_counts.py | 29 ++++ bin/count_clinvar_rcvs.py | 2 +- .../clinvar_to_evidence_strings.py | 95 +------------ cmat/output_generation/report.py | 133 ++++++++++++++++++ pipelines/annotation_pipeline.nf | 24 ++++ 5 files changed, 193 insertions(+), 90 deletions(-) create mode 100644 bin/aggregate_counts.py create mode 100644 cmat/output_generation/report.py diff --git 
a/bin/aggregate_counts.py b/bin/aggregate_counts.py new file mode 100644 index 00000000..894dfa10 --- /dev/null +++ b/bin/aggregate_counts.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +import argparse +import glob + +from cmat.output_generation.report import Report + +parser = argparse.ArgumentParser('Aggregate counts reports') +parser.add_argument('--counts-yml', nargs='+', help='YAML files containing intermediate counts', required=True) + + +if __name__ == '__main__': + args = parser.parse_args() + + # Load all the reports + filenames = [f for files in args.counts_yml for f in glob.glob(files)] + reports = [] + for filename in filenames: + r = Report() + r.load_from_file(filename) + reports.append(r) + + # Sum them up and output the results + complete_report = sum(reports, start=Report()) + complete_report.print_report() + complete_report.dump_to_file(dir_out='.') + complete_report.write_unmapped_terms(dir_out='.') + if not complete_report.check_counts(): + raise RuntimeError('Aggregate counts not consistent') diff --git a/bin/count_clinvar_rcvs.py b/bin/count_clinvar_rcvs.py index 13b34ae5..b11a8679 100644 --- a/bin/count_clinvar_rcvs.py +++ b/bin/count_clinvar_rcvs.py @@ -5,7 +5,7 @@ from cmat.clinvar_xml_io import ClinVarDataset parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout') -parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True) +parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True) if __name__ == '__main__': diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 06ea3e0c..5d441702 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -4,12 +4,13 @@ import re import sys import os -from collections import defaultdict, Counter +from collections import defaultdict import jsonschema from cmat.clinvar_xml_io import ClinVarDataset from cmat.output_generation import consequence_type as CT +from cmat.output_generation.report import Report logger = logging.getLogger(__package__) @@ -22,91 +23,6 @@ # Output file names. EVIDENCE_STRINGS_FILE_NAME = 'evidence_strings.json' EVIDENCE_RECORDS_FILE_NAME = 'evidence_records.tsv' -UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv' - - -class Report: - """Holds counters and other records for a pipeline run.""" - - def __init__(self, trait_mappings, consequence_mappings): - self.report_strings = [] - - # The main evidence string counter. - self.evidence_string_count = 0 - # Complete evidence strings are ones with an EFO mapping. - self.complete_evidence_string_count = 0 - - # ClinVar record counters. - self.clinvar_total = 0 - self.clinvar_fatal_no_valid_traits = 0 - self.clinvar_skip_unsupported_variation = 0 - self.clinvar_skip_no_functional_consequences = 0 - self.clinvar_skip_missing_efo_mapping = 0 - self.clinvar_skip_invalid_evidence_string = 0 - self.clinvar_done_one_complete_evidence_string = 0 - self.clinvar_done_multiple_complete_evidence_strings = 0 - - # Total number of trait-to-ontology mappings present in the database. - self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()]) - # All distinct (trait name, EFO ID) mappings used in the evidence strings. - self.used_trait_mappings = set() - # All unmapped trait names which prevented evidence string generation and their counts. - self.unmapped_trait_names = Counter() - - # Variant-to-consequence mapping counts. 
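aggregate_counts.py above expands each --counts-yml argument with glob and folds the per-chunk reports together via sum(reports, start=Report()), which relies on the __add__ method defined on the new Report class later in this patch. The same folding pattern on a toy class (Tally and its fields are illustrative, not part of the codebase):

class Tally:
    def __init__(self, done=0, skipped=0):
        self.done = done
        self.skipped = skipped

    def __add__(self, other):
        # sum() applies __add__ pairwise, beginning from the `start` value.
        return Tally(self.done + other.done, self.skipped + other.skipped)

chunks = [Tally(10, 1), Tally(8, 0), Tally(12, 3)]
total = sum(chunks, start=Tally())
print(total.done, total.skipped)  # 30 4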
- self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()]) - self.repeat_expansion_variants = 0 - self.structural_variants = 0 - - def print_report_and_check_counts(self): - """Print report of counts and return True if counts are consistent, False otherwise.""" - # TODO dump to a file so counts can be aggregated - # ClinVar tallies. - clinvar_fatal = self.clinvar_fatal_no_valid_traits - clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + - self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) - clinvar_done = (self.clinvar_done_one_complete_evidence_string + - self.clinvar_done_multiple_complete_evidence_strings) - - report = f'''Total number of evidence strings generated\t{self.evidence_string_count} - Total number of complete evidence strings generated\t{self.complete_evidence_string_count} - - Total number of ClinVar records\t{self.clinvar_total} - Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits} - Skipped: Can be rescued by future improvements\t{clinvar_skipped} - Unsupported variation type\t{self.clinvar_skip_unsupported_variation} - No functional consequences\t{self.clinvar_skip_no_functional_consequences} - Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping} - Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string} - Done: Generated at least one complete evidence string\t{clinvar_done} - One complete evidence string\t{self.clinvar_done_one_complete_evidence_string} - Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings} - Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{ - clinvar_done / (clinvar_skipped + clinvar_done):.1%} - - Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings} - The number of distinct trait-to-ontology mappings used in the evidence strings\t{ - len(self.used_trait_mappings)} - The number of distinct unmapped trait names which prevented complete evidence string generation\t{ - len(self.unmapped_trait_names)} - - Total number of variant to consequence mappings\t{self.total_consequence_mappings} - Number of repeat expansion variants\t{self.repeat_expansion_variants} - Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n') - print(report) - - # Confirm counts as expected, exit with error if not. 
- expected_total = clinvar_fatal + clinvar_skipped + clinvar_done - if expected_total != self.clinvar_total: - logger.error(f'ClinVar evidence string tallies do not add up to the total amount: ' - f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}') - return False - return True - - def write_unmapped_terms(self, dir_out): - with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file: - for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]): - unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n') def validate_evidence_string(ev_string, ot_schema_contents): @@ -131,8 +47,9 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc report, exception_raised = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end) - counts_consistent = report.print_report_and_check_counts() - report.write_unmapped_terms(dir_out) + counts_consistent = report.check_counts() + report.print_report() + report.dump_to_file(dir_out) if exception_raised or not counts_consistent: sys.exit(1) @@ -217,7 +134,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings evidence_strings_generated += 1 if disease_mapped_efo_id is not None: complete_evidence_strings_generated += 1 - report.used_trait_mappings.add((disease_name, disease_mapped_efo_id)) + report.used_trait_mappings.add(disease_mapped_efo_id) if complete_evidence_strings_generated == 1: report.clinvar_done_one_complete_evidence_string += 1 diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py new file mode 100644 index 00000000..a5b0093b --- /dev/null +++ b/cmat/output_generation/report.py @@ -0,0 +1,133 @@ +import logging +import os +from collections import Counter + +import yaml + +logger = logging.getLogger(__package__) + +# Add representers to enable safe-dump +yaml.SafeDumper.add_representer(Counter, + lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items())) + +UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv' +COUNTS_FILE_NAME = 'counts.yml' + + +class Report: + """Holds counters and other records for a pipeline run.""" + + def __init__(self, trait_mappings=None, consequence_mappings=None): + # The main evidence string counter. + self.evidence_string_count = 0 + # Complete evidence strings are ones with an EFO mapping. + self.complete_evidence_string_count = 0 + + # ClinVar record counters. + self.clinvar_total = 0 + self.clinvar_fatal_no_valid_traits = 0 + self.clinvar_skip_unsupported_variation = 0 + self.clinvar_skip_no_functional_consequences = 0 + self.clinvar_skip_missing_efo_mapping = 0 + self.clinvar_skip_invalid_evidence_string = 0 + self.clinvar_done_one_complete_evidence_string = 0 + self.clinvar_done_multiple_complete_evidence_strings = 0 + self.clinvar_fatal = 0 + self.clinvar_skipped = 0 + self.clinvar_done = 0 + + # Total number of trait-to-ontology mappings present in the database. + self.total_trait_mappings = 0 + if trait_mappings: + self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()]) + # All distinct (trait name, EFO ID) mappings used in the evidence strings. + self.used_trait_mappings = set() + # All unmapped trait names which prevented evidence string generation and their counts. 
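The representer registered on yaml.SafeDumper near the top of the new report.py is what lets safe_dump serialise a Counter as an ordinary YAML mapping; on safe_load it comes back as a plain dict and load_from_file rewraps it in a Counter. A standalone round-trip sketch of that idea (the trait names are made up):

from collections import Counter

import yaml

yaml.SafeDumper.add_representer(
    Counter,
    lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items()))

counts = Counter(['trait a', 'trait a', 'trait b'])
text = yaml.safe_dump({'unmapped_trait_names': counts})
loaded = yaml.safe_load(text)                       # plain dict after loading
restored = Counter(loaded['unmapped_trait_names'])  # rewrap, as load_from_file does
assert restored == counts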
+ self.unmapped_trait_names = Counter() + + # Variant-to-consequence mapping counts. + self.total_consequence_mappings = 0 + if consequence_mappings: + self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()]) + self.repeat_expansion_variants = 0 + self.structural_variants = 0 + + def __add__(self, other): + if not isinstance(other, Report): + raise ValueError('Both operands must be Report objects') + result = Report() + for var_name in vars(self).keys() | vars(other).keys(): + if var_name == 'total_trait_mappings': + result.total_trait_mappings = max(self.total_trait_mappings, other.total_trait_mappings) + if var_name == 'total_consequence_mappings': + result.total_consequence_mappings = max(self.total_consequence_mappings, + other.total_consequence_mappings) + if var_name == 'used_trait_mappings': + result.used_trait_mappings = self.used_trait_mappings | other.used_trait_mappings + else: + result.__setattr__(var_name, self.__getattribute__(var_name) + other.__getattribute__(var_name)) + return result + + def dump_to_file(self, dir_out, filename=COUNTS_FILE_NAME): + with open(os.path.join(dir_out, filename), 'w') as f: + yaml.safe_dump(vars(self), f) + + def load_from_file(self, filename): + with open(filename, 'r') as f: + data = yaml.safe_load(f) + self.__dict__.update(**data) + # yaml loads a dict, convert to counter + self.unmapped_trait_names = Counter(self.unmapped_trait_names) + + def compute_record_tallies(self): + """Compute tallies of records fatal/skipped/done based on the more granular counts.""" + self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) + self.clinvar_done = (self.clinvar_done_one_complete_evidence_string + + self.clinvar_done_multiple_complete_evidence_strings) + + def check_counts(self): + """Return True if counts are consistent, False otherwise.""" + self.compute_record_tallies() + expected_total = self.clinvar_fatal + self.clinvar_skipped + self.clinvar_done + if expected_total != self.clinvar_total: + logger.error(f'ClinVar evidence string tallies do not add up to the total amount: ' + f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}') + return False + return True + + def print_report(self): + """Print report of counts.""" + self.compute_record_tallies() + report = f'''Total number of evidence strings generated\t{self.evidence_string_count} + Total number of complete evidence strings generated\t{self.complete_evidence_string_count} + + Total number of ClinVar records\t{self.clinvar_total} + Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits} + Skipped: Can be rescued by future improvements\t{self.clinvar_skipped} + Unsupported variation type\t{self.clinvar_skip_unsupported_variation} + No functional consequences\t{self.clinvar_skip_no_functional_consequences} + Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping} + Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string} + Done: Generated at least one complete evidence string\t{self.clinvar_done} + One complete evidence string\t{self.clinvar_done_one_complete_evidence_string} + Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings} + Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{ + self.clinvar_done / 
(self.clinvar_skipped + self.clinvar_done):.1%} + + Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings} + The number of distinct trait-to-ontology mappings used in the evidence strings\t{ + len(self.used_trait_mappings)} + The number of distinct unmapped trait names which prevented complete evidence string generation\t{ + len(self.unmapped_trait_names)} + + Total number of variant to consequence mappings\t{self.total_consequence_mappings} + Number of repeat expansion variants\t{self.repeat_expansion_variants} + Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n') + print(report) + + def write_unmapped_terms(self, dir_out): + with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file: + for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]): + unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n') diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index e65519d5..faab7442 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -74,6 +74,7 @@ workflow { combineConsequences.out.consequencesCombined, startEndPairs.collect()) collectEvidenceStrings(generateEvidence.out.evidenceStrings.collect()) + collectCounts(generateEvidence.out.countsYml.collect()) checkDuplicates(collectEvidenceStrings.out.evidenceStrings) convertXrefs(clinvarXml) @@ -353,6 +354,7 @@ process generateEvidence { output: path "evidence_strings_*.json", emit: evidenceStrings + path "counts_*.yml", emit: countsYml script: """ @@ -365,6 +367,7 @@ process generateEvidence { --start ${startEnd[0]} \ --end ${startEnd[1]} mv evidence_strings.json evidence_strings_${startEnd[0]}.json + mv counts.yml counts_${startEnd[0]}.yml """ } @@ -389,6 +392,27 @@ process collectEvidenceStrings { """ } +/* + * Aggregate counts into a single file and print the report. + */ +process collectCounts { + publishDir "${batchRoot}/logs", + overwrite: true, + mode: "copy", + pattern: "*.yml" + + input: + path "counts_*.yml" + + output: + path "counts.yml", emit: countsYml + + script: + """ + \${PYTHON_BIN} ${codeRoot}/bin/aggregate_counts.py --counts-yml counts_*.yml + """ +} + /* * Check that the generated evidence strings do not contain any duplicated evidence strings. 
*/ From 6d6e45005d687210b2cf8858d53b9899afd53eff Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 4 Mar 2024 14:52:16 +0000 Subject: [PATCH 3/5] add tests and cleanup nextflow --- cmat/output_generation/report.py | 7 +++- pipelines/annotation_pipeline.nf | 6 +-- requirements.txt | 1 + tests/output_generation/test_report.py | 54 ++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 tests/output_generation/test_report.py diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index a5b0093b..55d03b50 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -52,9 +52,14 @@ def __init__(self, trait_mappings=None, consequence_mappings=None): self.repeat_expansion_variants = 0 self.structural_variants = 0 + def __eq__(self, other): + if not isinstance(other, Report): + return NotImplemented + return self.__dict__ == other.__dict__ + def __add__(self, other): if not isinstance(other, Report): - raise ValueError('Both operands must be Report objects') + return NotImplemented result = Report() for var_name in vars(self).keys() | vars(other).keys(): if var_name == 'total_trait_mappings': diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index faab7442..28454081 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -353,8 +353,8 @@ process generateEvidence { each startEnd output: - path "evidence_strings_*.json", emit: evidenceStrings - path "counts_*.yml", emit: countsYml + path "evidence_strings.json", emit: evidenceStrings + path "counts.yml", emit: countsYml script: """ @@ -366,8 +366,6 @@ process generateEvidence { --out . \ --start ${startEnd[0]} \ --end ${startEnd[1]} - mv evidence_strings.json evidence_strings_${startEnd[0]}.json - mv counts.yml counts_${startEnd[0]}.yml """ } diff --git a/requirements.txt b/requirements.txt index 99f5a47c..3694df5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ numpy==1.24.3 pandas==1.5.3 pytest==7.2.2 pytest-cov==2.10.0 +pyyaml==6.0.1 requests==2.31.0 requests-mock==1.8.0 retry==0.9.2 diff --git a/tests/output_generation/test_report.py b/tests/output_generation/test_report.py new file mode 100644 index 00000000..661fdb35 --- /dev/null +++ b/tests/output_generation/test_report.py @@ -0,0 +1,54 @@ +import os +from collections import Counter + +from cmat.output_generation.report import Report + + +resources_dir = os.path.join(os.path.dirname(__file__), 'resources') + + +def get_test_report(): + report = Report() + report.evidence_string_count = 42 + report.used_trait_mappings = {'one', 'two', 'three'} + report.unmapped_trait_names = Counter('abracadabra') + return report + + +def test_dump_and_load(): + counts_file = os.path.join(resources_dir, 'counts.yml') + original_report = get_test_report() + original_report.dump_to_file(resources_dir) + + loaded_report = Report() + loaded_report.load_from_file(counts_file) + assert original_report == loaded_report + + if os.path.exists(counts_file): + os.remove(counts_file) + + +def test_add_operator(): + report_1 = get_test_report() + + report_2 = Report() + report_2.evidence_string_count = 13 + report_2.total_trait_mappings = 100 + report_2.used_trait_mappings = {'three', 'four'} + report_2.unmapped_trait_names = Counter('alakazam') + + combined = report_1 + report_2 + assert combined.clinvar_total == 0 + assert combined.evidence_string_count == 55 + assert combined.total_trait_mappings == 100 + assert combined.used_trait_mappings == 
{'one', 'two', 'three', 'four'} + assert combined.unmapped_trait_names == Counter( + {'a': 9, 'b': 2, 'r': 2, 'c': 1, 'd': 1, 'l': 1, 'k': 1, 'z': 1, 'm': 1} + ) + + +def test_check_counts(): + report = get_test_report() + assert report.check_counts() + report.clinvar_total = 5 + assert not report.check_counts() From ddd9eaa9490c8c50edc26ec0ca9be88946cd19ef Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Mar 2024 09:23:29 +0000 Subject: [PATCH 4/5] adjust counts to appropriately skip invalid clinical significance values --- cmat/clinvar_xml_io/clinvar_record.py | 6 +++++ .../clinvar_to_evidence_strings.py | 26 ++++++++++++------- cmat/output_generation/report.py | 4 ++- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index da653db8..aca234e3 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -34,6 +34,8 @@ class ClinVarRecord: # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful. NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'} + # Some records have been flagged by ClinVar and should not be used. + INVALID_CLINICAL_SIGNFICANCES = {'no classifications from unflagged records'} def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure): """Initialise a ClinVar record object from an RCV XML record.""" @@ -143,6 +145,10 @@ def clinical_significance_list(self): See /data-exploration/clinvar-variant-types/README.md for further explanation.""" return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' '))))) + @property + def valid_clinical_significances(self): + return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNFICANCES] + @property def allele_origins(self): return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')} diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 5d441702..b1cb25c2 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -82,20 +82,25 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings if not clinvar_record.traits_with_valid_names: report.clinvar_fatal_no_valid_traits += 1 continue + # Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to + # submissions being flagged. + if not clinvar_record.valid_clinical_significances: + report.clinvar_fatal_no_clinical_significance += 1 + continue - # Failure mode 2 (skip). A ClinVar record contains an unsupported variation type. + # Failure mode 3 (skip). A ClinVar record contains an unsupported variation type. if clinvar_record.measure is None: report.clinvar_skip_unsupported_variation += 1 continue - # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele - # origins, (2) EFO mappings, and (3) genes where the variant has effect. + # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid + # allele origins, (2) EFO mappings, and (3) genes where the variant has effect. 
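As the comment above describes, one candidate evidence string is produced per combination of allele-origin group, disease mapping and functional consequence; the combinations come from the itertools.product call a few lines further down in this function. A toy illustration with made-up values:

import itertools

allele_origin_groups = [['germline'], ['somatic']]
diseases = [('disease name', 'MONDO:0000001', 'EFO_0000001')]
consequences = ['missense_variant', 'stop_gained']

combinations = list(itertools.product(allele_origin_groups, diseases, consequences))
print(len(combinations))  # 2 * 1 * 2 = 4 candidate evidence strings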
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins) consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings) grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names, string_to_efo_mappings) - # Failure mode 3 (skip). No functional consequences are available. + # Failure mode 4 (skip). No functional consequences are available. if not consequence_types: report.clinvar_skip_no_functional_consequences += 1 continue @@ -106,9 +111,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings if is_structural_variant(clinvar_record.measure): report.structural_variants += len(consequence_types) - # Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable - # EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a - # failure so we can continue to measure mapping coverage. + # Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no + # suitable EFO mappings were found in the database. This will still generate an evidence string, but is + # tracked as a failure so we can continue to measure mapping coverage. if not any(group[-1] for group in grouped_diseases): report.clinvar_skip_missing_efo_mapping += 1 unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name @@ -122,8 +127,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings for allele_origins, disease_attributes, consequence_attributes in itertools.product( grouped_allele_origins, grouped_diseases, consequence_types): disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes - evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, - disease_mapped_efo_id, consequence_attributes) + evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, + disease_source_id, disease_mapped_efo_id, + consequence_attributes) # Validate and immediately output the evidence string (not keeping everything in memory). is_valid = validate_evidence_string(evidence_string, ot_schema_contents) @@ -185,7 +191,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea 'allelicRequirements': clinvar_record.mode_of_inheritance, # Levels of clinical significance reported for the variant. - 'clinicalSignificances': clinvar_record.clinical_significance_list, + 'clinicalSignificances': clinvar_record.valid_clinical_significances, # Confidence (review status). 'confidence': clinvar_record.review_status, diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index 55d03b50..fcd37742 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -26,6 +26,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None): # ClinVar record counters. 
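The valid_clinical_significances property introduced above filters the parsed significance terms against INVALID_CLINICAL_SIGNFICANCES, so records whose only term is the flagged-record placeholder are counted as fatal and the placeholder never appears in clinicalSignificances. A small standalone sketch of that parse-then-filter step (the raw value is contrived for illustration):

import re

INVALID = {'no classifications from unflagged records'}

raw = 'Pathogenic/Likely_pathogenic; no_classifications_from_unflagged_records'
terms = sorted(set(re.split('/|, |; ', raw.lower().replace('_', ' '))))
valid = [t for t in terms if t not in INVALID]
print(terms)  # ['likely pathogenic', 'no classifications from unflagged records', 'pathogenic']
print(valid)  # ['likely pathogenic', 'pathogenic']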
self.clinvar_total = 0 self.clinvar_fatal_no_valid_traits = 0 + self.clinvar_fatal_no_clinical_significance = 0 self.clinvar_skip_unsupported_variation = 0 self.clinvar_skip_no_functional_consequences = 0 self.clinvar_skip_missing_efo_mapping = 0 @@ -86,7 +87,7 @@ def load_from_file(self, filename): def compute_record_tallies(self): """Compute tallies of records fatal/skipped/done based on the more granular counts.""" - self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) self.clinvar_done = (self.clinvar_done_one_complete_evidence_string + @@ -110,6 +111,7 @@ def print_report(self): Total number of ClinVar records\t{self.clinvar_total} Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits} + No clinical significance\t{self.clinvar_fatal_no_clinical_significance} Skipped: Can be rescued by future improvements\t{self.clinvar_skipped} Unsupported variation type\t{self.clinvar_skip_unsupported_variation} No functional consequences\t{self.clinvar_skip_no_functional_consequences} From 0668213c85e09514755bdd1ff2f64bced1418c37 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Mar 2024 16:23:08 +0000 Subject: [PATCH 5/5] bugfix for summing reports --- cmat/output_generation/report.py | 4 ++-- tests/output_generation/test_report.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index fcd37742..f7b404c7 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -65,10 +65,10 @@ def __add__(self, other): for var_name in vars(self).keys() | vars(other).keys(): if var_name == 'total_trait_mappings': result.total_trait_mappings = max(self.total_trait_mappings, other.total_trait_mappings) - if var_name == 'total_consequence_mappings': + elif var_name == 'total_consequence_mappings': result.total_consequence_mappings = max(self.total_consequence_mappings, other.total_consequence_mappings) - if var_name == 'used_trait_mappings': + elif var_name == 'used_trait_mappings': result.used_trait_mappings = self.used_trait_mappings | other.used_trait_mappings else: result.__setattr__(var_name, self.__getattribute__(var_name) + other.__getattribute__(var_name)) diff --git a/tests/output_generation/test_report.py b/tests/output_generation/test_report.py index 661fdb35..f356ecce 100644 --- a/tests/output_generation/test_report.py +++ b/tests/output_generation/test_report.py @@ -30,6 +30,7 @@ def test_dump_and_load(): def test_add_operator(): report_1 = get_test_report() + report_1.total_trait_mappings = 100 report_2 = Report() report_2.evidence_string_count = 13
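Patch 5 turns the second and third special cases in Report.__add__ into elif branches. With three independent if statements the trailing else pairs only with the last one, so after total_trait_mappings (or total_consequence_mappings) is combined with max() the else branch still runs and overwrites the result with a plain sum; the test's new report_1.total_trait_mappings = 100 line makes the existing assertion catch this. A toy reproduction of the fall-through (function and field names are illustrative):

def combine_buggy(name, a, b):
    if name == 'total_trait_mappings':
        result = max(a, b)   # special case runs...
    if name == 'used_trait_mappings':
        result = a | b
    else:
        result = a + b       # ...but the else paired with the second if runs too
    return result

def combine_fixed(name, a, b):
    if name == 'total_trait_mappings':
        result = max(a, b)
    elif name == 'used_trait_mappings':
        result = a | b
    else:
        result = a + b
    return result

print(combine_buggy('total_trait_mappings', 100, 100))  # 200 -- summed by mistake
print(combine_fixed('total_trait_mappings', 100, 100))  # 100 -- max, as intended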