diff --git a/bin/aggregate_counts.py b/bin/aggregate_counts.py
new file mode 100644
index 00000000..894dfa10
--- /dev/null
+++ b/bin/aggregate_counts.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import argparse
+import glob
+
+from cmat.output_generation.report import Report
+
+parser = argparse.ArgumentParser('Aggregate counts reports')
+parser.add_argument('--counts-yml', nargs='+', help='YAML files containing intermediate counts', required=True)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+
+    # Load all the reports
+    filenames = [f for files in args.counts_yml for f in glob.glob(files)]
+    reports = []
+    for filename in filenames:
+        r = Report()
+        r.load_from_file(filename)
+        reports.append(r)
+
+    # Sum them up and output the results
+    complete_report = sum(reports, start=Report())
+    complete_report.print_report()
+    complete_report.dump_to_file(dir_out='.')
+    complete_report.write_unmapped_terms(dir_out='.')
+    if not complete_report.check_counts():
+        raise RuntimeError('Aggregate counts not consistent')
diff --git a/bin/count_clinvar_rcvs.py b/bin/count_clinvar_rcvs.py
new file mode 100644
index 00000000..b11a8679
--- /dev/null
+++ b/bin/count_clinvar_rcvs.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import argparse
+
+from cmat.clinvar_xml_io import ClinVarDataset
+
+parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout')
+parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print(sum(1 for _ in ClinVarDataset(args.clinvar_xml)))
diff --git a/bin/evidence_string_generation.py b/bin/evidence_string_generation.py
index 6b6aabf4..28893942 100755
--- a/bin/evidence_string_generation.py
+++ b/bin/evidence_string_generation.py
@@ -9,10 +9,12 @@
 parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
 parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True)
 parser.add_argument('--out', help='Output directory', required=True)
+parser.add_argument('--start', help='Start index (inclusive)', required=False, type=int)
+parser.add_argument('--end', help='End index (exclusive)', required=False, type=int)
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
     clinvar_to_evidence_strings.launch_pipeline(
         clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
-        ot_schema_file=args.ot_schema, dir_out=args.out)
+        ot_schema_file=args.ot_schema, dir_out=args.out, start=args.start, end=args.end)
diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py
index da653db8..aca234e3 100644
--- a/cmat/clinvar_xml_io/clinvar_record.py
+++ b/cmat/clinvar_xml_io/clinvar_record.py
@@ -34,6 +34,8 @@ class ClinVarRecord:
     # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
     NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
 
+    # Some records have been flagged by ClinVar and should not be used.
+    INVALID_CLINICAL_SIGNIFICANCES = {'no classifications from unflagged records'}
 
     def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
         """Initialise a ClinVar record object from an RCV XML record."""
@@ -143,6 +145,10 @@ def clinical_significance_list(self):
         See /data-exploration/clinvar-variant-types/README.md for further explanation."""
         return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))
 
+    @property
+    def valid_clinical_significances(self):
+        return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNIFICANCES]
+
     @property
     def allele_origins(self):
         return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py
index 74ffd48f..b1cb25c2 100644
--- a/cmat/output_generation/clinvar_to_evidence_strings.py
+++ b/cmat/output_generation/clinvar_to_evidence_strings.py
@@ -4,12 +4,13 @@
 import re
 import sys
 import os
-from collections import defaultdict, Counter
+from collections import defaultdict
 
 import jsonschema
 
 from cmat.clinvar_xml_io import ClinVarDataset
 from cmat.output_generation import consequence_type as CT
+from cmat.output_generation.report import Report
 
 logger = logging.getLogger(__package__)
 
@@ -22,90 +23,6 @@
 # Output file names.
 EVIDENCE_STRINGS_FILE_NAME = 'evidence_strings.json'
 EVIDENCE_RECORDS_FILE_NAME = 'evidence_records.tsv'
-UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'
-
-
-class Report:
-    """Holds counters and other records for a pipeline run."""
-
-    def __init__(self, trait_mappings, consequence_mappings):
-        self.report_strings = []
-
-        # The main evidence string counter.
-        self.evidence_string_count = 0
-        # Complete evidence strings are ones with an EFO mapping.
-        self.complete_evidence_string_count = 0
-
-        # ClinVar record counters.
-        self.clinvar_total = 0
-        self.clinvar_fatal_no_valid_traits = 0
-        self.clinvar_skip_unsupported_variation = 0
-        self.clinvar_skip_no_functional_consequences = 0
-        self.clinvar_skip_missing_efo_mapping = 0
-        self.clinvar_skip_invalid_evidence_string = 0
-        self.clinvar_done_one_complete_evidence_string = 0
-        self.clinvar_done_multiple_complete_evidence_strings = 0
-
-        # Total number of trait-to-ontology mappings present in the database.
-        self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
-        # All distinct (trait name, EFO ID) mappings used in the evidence strings.
-        self.used_trait_mappings = set()
-        # All unmapped trait names which prevented evidence string generation and their counts.
-        self.unmapped_trait_names = Counter()
-
-        # Variant-to-consequence mapping counts.
-        self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
-        self.repeat_expansion_variants = 0
-        self.structural_variants = 0
-
-    def print_report_and_check_counts(self):
-        """Print report of counts and return True if counts are consistent, False otherwise."""
-        # ClinVar tallies.
-        clinvar_fatal = self.clinvar_fatal_no_valid_traits
-        clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
-                           self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
-        clinvar_done = (self.clinvar_done_one_complete_evidence_string +
-                        self.clinvar_done_multiple_complete_evidence_strings)
-
-        report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
-            Total number of complete evidence strings generated\t{self.complete_evidence_string_count}
-
-            Total number of ClinVar records\t{self.clinvar_total}
-                Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
-                Skipped: Can be rescued by future improvements\t{clinvar_skipped}
-                    Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
-                    No functional consequences\t{self.clinvar_skip_no_functional_consequences}
-                    Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
-                    Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
-                Done: Generated at least one complete evidence string\t{clinvar_done}
-                    One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
-                    Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
-            Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
-                clinvar_done / (clinvar_skipped + clinvar_done):.1%}
-
-            Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
-            The number of distinct trait-to-ontology mappings used in the evidence strings\t{
-                len(self.used_trait_mappings)}
-            The number of distinct unmapped trait names which prevented complete evidence string generation\t{
-                len(self.unmapped_trait_names)}
-
-            Total number of variant to consequence mappings\t{self.total_consequence_mappings}
-            Number of repeat expansion variants\t{self.repeat_expansion_variants}
-            Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
-        print(report)
-
-        # Confirm counts as expected, exit with error if not.
-        expected_total = clinvar_fatal + clinvar_skipped + clinvar_done
-        if expected_total != self.clinvar_total:
-            logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
-                         f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
-            return False
-        return True
-
-    def write_unmapped_terms(self, dir_out):
-        with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
-            for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
-                unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')
 
 
 def validate_evidence_string(ev_string, ot_schema_contents):
@@ -122,29 +39,38 @@ def validate_evidence_string(ev_string, ot_schema_contents):
         sys.exit(1)
 
 
-def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out):
+def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, start, end):
     os.makedirs(dir_out, exist_ok=True)
     string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file)
     variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)
     report, exception_raised = clinvar_to_evidence_strings(
         string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
-        output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
-    counts_consistent = report.print_report_and_check_counts()
-    report.write_unmapped_terms(dir_out)
+        output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end)
+    counts_consistent = report.check_counts()
+    report.print_report()
+    report.dump_to_file(dir_out)
     if exception_raised or not counts_consistent:
         sys.exit(1)
 
 
 def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
-                                output_evidence_strings):
+                                output_evidence_strings, start=None, end=None):
     report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
     ot_schema_contents = json.loads(open(ot_schema).read())
     output_evidence_strings_file = open(output_evidence_strings, 'wt')
     exception_raised = False
     logger.info('Processing ClinVar records')
+    i = -1
     for clinvar_record in ClinVarDataset(clinvar_xml):
+        # If start & end provided, only process records in the range [start, end)
+        i += 1
+        if start and i < start:
+            continue
+        if end and i >= end:
+            break
+
         report.clinvar_total += 1
         if report.clinvar_total % 1000 == 0:
             logger.info(f'{report.clinvar_total} records processed')
@@ -156,20 +82,25 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
         if not clinvar_record.traits_with_valid_names:
             report.clinvar_fatal_no_valid_traits += 1
             continue
+        # Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
+        # submissions being flagged.
+        if not clinvar_record.valid_clinical_significances:
+            report.clinvar_fatal_no_clinical_significance += 1
+            continue
 
-        # Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
+        # Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
         if clinvar_record.measure is None:
             report.clinvar_skip_unsupported_variation += 1
             continue
 
-        # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
-        # origins, (2) EFO mappings, and (3) genes where the variant has effect.
+        # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid
+        # allele origins, (2) EFO mappings, and (3) genes where the variant has effect.
         grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
         consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
         grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names, string_to_efo_mappings)
 
-        # Failure mode 3 (skip). No functional consequences are available.
+        # Failure mode 4 (skip). No functional consequences are available.
         if not consequence_types:
             report.clinvar_skip_no_functional_consequences += 1
             continue
@@ -180,9 +111,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
         if is_structural_variant(clinvar_record.measure):
             report.structural_variants += len(consequence_types)
 
-        # Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable
-        # EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a
-        # failure so we can continue to measure mapping coverage.
+        # Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
+        # suitable EFO mappings were found in the database. This will still generate an evidence string, but is
+        # tracked as a failure so we can continue to measure mapping coverage.
         if not any(group[-1] for group in grouped_diseases):
             report.clinvar_skip_missing_efo_mapping += 1
             unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name
@@ -196,8 +127,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
         for allele_origins, disease_attributes, consequence_attributes in itertools.product(
                 grouped_allele_origins, grouped_diseases, consequence_types):
             disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
-            evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
-                                                       disease_mapped_efo_id, consequence_attributes)
+            evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name,
+                                                       disease_source_id, disease_mapped_efo_id,
+                                                       consequence_attributes)
 
             # Validate and immediately output the evidence string (not keeping everything in memory).
             is_valid = validate_evidence_string(evidence_string, ot_schema_contents)
@@ -208,7 +140,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
             evidence_strings_generated += 1
             if disease_mapped_efo_id is not None:
                 complete_evidence_strings_generated += 1
-                report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))
+                report.used_trait_mappings.add(disease_mapped_efo_id)
 
         if complete_evidence_strings_generated == 1:
             report.clinvar_done_one_complete_evidence_string += 1
@@ -259,7 +191,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
         'allelicRequirements': clinvar_record.mode_of_inheritance,
 
         # Levels of clinical significance reported for the variant.
-        'clinicalSignificances': clinvar_record.clinical_significance_list,
+        'clinicalSignificances': clinvar_record.valid_clinical_significances,
 
         # Confidence (review status).
         'confidence': clinvar_record.review_status,
diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py
new file mode 100644
index 00000000..f7b404c7
--- /dev/null
+++ b/cmat/output_generation/report.py
@@ -0,0 +1,140 @@
+import logging
+import os
+from collections import Counter
+
+import yaml
+
+logger = logging.getLogger(__package__)
+
+# Add representers to enable safe-dump
+yaml.SafeDumper.add_representer(Counter,
+                                lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items()))
+
+UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'
+COUNTS_FILE_NAME = 'counts.yml'
+
+
+class Report:
+    """Holds counters and other records for a pipeline run."""
+
+    def __init__(self, trait_mappings=None, consequence_mappings=None):
+        # The main evidence string counter.
+        self.evidence_string_count = 0
+        # Complete evidence strings are ones with an EFO mapping.
+        self.complete_evidence_string_count = 0
+
+        # ClinVar record counters.
+        self.clinvar_total = 0
+        self.clinvar_fatal_no_valid_traits = 0
+        self.clinvar_fatal_no_clinical_significance = 0
+        self.clinvar_skip_unsupported_variation = 0
+        self.clinvar_skip_no_functional_consequences = 0
+        self.clinvar_skip_missing_efo_mapping = 0
+        self.clinvar_skip_invalid_evidence_string = 0
+        self.clinvar_done_one_complete_evidence_string = 0
+        self.clinvar_done_multiple_complete_evidence_strings = 0
+        self.clinvar_fatal = 0
+        self.clinvar_skipped = 0
+        self.clinvar_done = 0
+
+        # Total number of trait-to-ontology mappings present in the database.
+        self.total_trait_mappings = 0
+        if trait_mappings:
+            self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
+        # All distinct EFO IDs of the trait mappings used in the evidence strings.
+        self.used_trait_mappings = set()
+        # All unmapped trait names which prevented evidence string generation and their counts.
+        self.unmapped_trait_names = Counter()
+
+        # Variant-to-consequence mapping counts.
+        self.total_consequence_mappings = 0
+        if consequence_mappings:
+            self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
+        self.repeat_expansion_variants = 0
+        self.structural_variants = 0
+
+    def __eq__(self, other):
+        if not isinstance(other, Report):
+            return NotImplemented
+        return self.__dict__ == other.__dict__
+
+    def __add__(self, other):
+        if not isinstance(other, Report):
+            return NotImplemented
+        result = Report()
+        for var_name in vars(self).keys() | vars(other).keys():
+            if var_name == 'total_trait_mappings':
+                result.total_trait_mappings = max(self.total_trait_mappings, other.total_trait_mappings)
+            elif var_name == 'total_consequence_mappings':
+                result.total_consequence_mappings = max(self.total_consequence_mappings,
+                                                        other.total_consequence_mappings)
+            elif var_name == 'used_trait_mappings':
+                result.used_trait_mappings = self.used_trait_mappings | other.used_trait_mappings
+            else:
+                result.__setattr__(var_name, self.__getattribute__(var_name) + other.__getattribute__(var_name))
+        return result
+
+    def dump_to_file(self, dir_out, filename=COUNTS_FILE_NAME):
+        with open(os.path.join(dir_out, filename), 'w') as f:
+            yaml.safe_dump(vars(self), f)
+
+    def load_from_file(self, filename):
+        with open(filename, 'r') as f:
+            data = yaml.safe_load(f)
+        self.__dict__.update(**data)
+        # yaml loads a dict, convert to counter
+        self.unmapped_trait_names = Counter(self.unmapped_trait_names)
+
+    def compute_record_tallies(self):
+        """Compute tallies of records fatal/skipped/done based on the more granular counts."""
+        self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance
+        self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
+                                self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
+        self.clinvar_done = (self.clinvar_done_one_complete_evidence_string +
+                             self.clinvar_done_multiple_complete_evidence_strings)
+
+    def check_counts(self):
+        """Return True if counts are consistent, False otherwise."""
+        self.compute_record_tallies()
+        expected_total = self.clinvar_fatal + self.clinvar_skipped + self.clinvar_done
+        if expected_total != self.clinvar_total:
+            logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
+                         f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
+            return False
+        return True
+
+    def print_report(self):
+        """Print report of counts."""
+        self.compute_record_tallies()
+        report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
+            Total number of complete evidence strings generated\t{self.complete_evidence_string_count}
+
+            Total number of ClinVar records\t{self.clinvar_total}
+                Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
+                    No clinical significance\t{self.clinvar_fatal_no_clinical_significance}
+                Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
+                    Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
+                    No functional consequences\t{self.clinvar_skip_no_functional_consequences}
+                    Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
+                    Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
+                Done: Generated at least one complete evidence string\t{self.clinvar_done}
+                    One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
+                    Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
+            Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
+                self.clinvar_done / (self.clinvar_skipped + self.clinvar_done):.1%}
+
+            Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
+            The number of distinct trait-to-ontology mappings used in the evidence strings\t{
+                len(self.used_trait_mappings)}
+            The number of distinct unmapped trait names which prevented complete evidence string generation\t{
+                len(self.unmapped_trait_names)}
+
+            Total number of variant to consequence mappings\t{self.total_consequence_mappings}
+            Number of repeat expansion variants\t{self.repeat_expansion_variants}
+            Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
+        print(report)
+
+    def write_unmapped_terms(self, dir_out):
+        with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
+            for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
+                unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')
diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf
index c7b5411b..28454081 100644
--- a/pipelines/annotation_pipeline.nf
+++ b/pipelines/annotation_pipeline.nf
@@ -34,6 +34,8 @@ if (!params.output_dir) {
 batchRoot = params.output_dir
 codeRoot = "${projectDir}/.."
 includeTranscriptsFlag = params.include_transcripts ? "--include-transcripts" : ""
+// Number of chunks to run in parallel for evidence generation
+numChunks = 10
 
 /*
  * Main workflow.
 */
@@ -56,11 +58,24 @@ workflow {
     if (params.schema != null) {
         // Open Targets evidence string output
         downloadJsonSchema()
+        // Get start/end indices to break XML into chunks
+        countClinvarRecords(clinvarXml)
+            .map { strN ->
+                n = Integer.parseInt(strN.trim())
+                step = (Integer)Math.ceil(n / numChunks)
+                startIndices = (0..n-1).step(step)
+                endIndices = startIndices[1..-1] + [n]
+                [startIndices, endIndices].transpose()
+            }
+            .set { startEndPairs }
+        // Generate evidence for each chunk and concatenate
         generateEvidence(clinvarXml,
                          downloadJsonSchema.out.jsonSchema,
-                         combineConsequences.out.consequencesCombined)
-
-        checkDuplicates(generateEvidence.out.evidenceStrings)
+                         combineConsequences.out.consequencesCombined,
+                         startEndPairs.collect())
+        collectEvidenceStrings(generateEvidence.out.evidenceStrings.collect())
+        collectCounts(generateEvidence.out.countsYml.collect())
+        checkDuplicates(collectEvidenceStrings.out.evidenceStrings)
 
         convertXrefs(clinvarXml)
     } else {
@@ -308,25 +323,38 @@ process generateAnnotatedXml {
     """
 }
 
+/*
+ * Count number of RCV records in ClinVar.
+ */
+process countClinvarRecords {
+    input:
+    path clinvarXml
+
+    output:
+    stdout emit: count
+
+    script:
+    """
+    \${PYTHON_BIN} ${codeRoot}/bin/count_clinvar_rcvs.py --clinvar-xml ${clinvarXml}
+    """
+}
+
 /*
  * Generate the evidence strings for submission to Open Targets.
 */
process generateEvidence {
-    clusterOptions "-o ${batchRoot}/logs/evidence_string_generation.out \
-                    -e ${batchRoot}/logs/evidence_string_generation.err"
-
-    publishDir "${batchRoot}/evidence_strings",
-        overwrite: true,
-        mode: "copy",
-        pattern: "*.json"
+    clusterOptions "-o ${batchRoot}/logs/evidence_string_generation_${startEnd[0]}.out \
+                    -e ${batchRoot}/logs/evidence_string_generation_${startEnd[0]}.err"
 
     input:
    path clinvarXml
    path jsonSchema
    path consequenceMappings
+    each startEnd
 
     output:
    path "evidence_strings.json", emit: evidenceStrings
+    path "counts.yml", emit: countsYml
 
     script:
     """
@@ -335,7 +363,51 @@ process generateEvidence {
         --efo-mapping ${params.mappings} \
         --gene-mapping ${consequenceMappings} \
         --ot-schema ${jsonSchema} \
-        --out .
+        --out . \
+        --start ${startEnd[0]} \
+        --end ${startEnd[1]}
     """
 }
 
+/*
+ * Concatenate evidence strings into a single file.
+ */
+process collectEvidenceStrings {
+    publishDir "${batchRoot}/evidence_strings",
+        overwrite: true,
+        mode: "copy",
+        pattern: "*.json"
+
+    input:
+    path "evidence_strings_*.json"
+
+    output:
+    path "evidence_strings.json", emit: evidenceStrings
+
+    script:
+    """
+    cat evidence_strings_*.json >> evidence_strings.json
+    """
+}
+
+/*
+ * Aggregate counts into a single file and print the report.
+ */
+process collectCounts {
+    publishDir "${batchRoot}/logs",
+        overwrite: true,
+        mode: "copy",
+        pattern: "*.yml"
+
+    input:
+    path "counts_*.yml"
+
+    output:
+    path "counts.yml", emit: countsYml
+
+    script:
+    """
+    \${PYTHON_BIN} ${codeRoot}/bin/aggregate_counts.py --counts-yml counts_*.yml
+    """
+}
diff --git a/requirements.txt b/requirements.txt
index 99f5a47c..3694df5e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ numpy==1.24.3
 pandas==1.5.3
 pytest==7.2.2
 pytest-cov==2.10.0
+pyyaml==6.0.1
 requests==2.31.0
 requests-mock==1.8.0
 retry==0.9.2
diff --git a/tests/output_generation/test_clinvar_to_evidence_strings.py b/tests/output_generation/test_clinvar_to_evidence_strings.py
index d99bf7e2..940f3738 100644
--- a/tests/output_generation/test_clinvar_to_evidence_strings.py
+++ b/tests/output_generation/test_clinvar_to_evidence_strings.py
@@ -1,6 +1,5 @@
 import json
 import os
-import re
 import requests
 import xml.etree.ElementTree as ElementTree
 
diff --git a/tests/output_generation/test_report.py b/tests/output_generation/test_report.py
new file mode 100644
index 00000000..f356ecce
--- /dev/null
+++ b/tests/output_generation/test_report.py
@@ -0,0 +1,55 @@
+import os
+from collections import Counter
+
+from cmat.output_generation.report import Report
+
+
+resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
+
+
+def get_test_report():
+    report = Report()
+    report.evidence_string_count = 42
+    report.used_trait_mappings = {'one', 'two', 'three'}
+    report.unmapped_trait_names = Counter('abracadabra')
+    return report
+
+
+def test_dump_and_load():
+    counts_file = os.path.join(resources_dir, 'counts.yml')
+    original_report = get_test_report()
+    original_report.dump_to_file(resources_dir)
+
+    loaded_report = Report()
+    loaded_report.load_from_file(counts_file)
+    assert original_report == loaded_report
+
+    if os.path.exists(counts_file):
+        os.remove(counts_file)
+
+
+def test_add_operator():
+    report_1 = get_test_report()
+    report_1.total_trait_mappings = 100
+
+    report_2 = Report()
+    report_2.evidence_string_count = 13
+    report_2.total_trait_mappings = 100
+    report_2.used_trait_mappings = {'three', 'four'}
+    report_2.unmapped_trait_names = Counter('alakazam')
+
+    combined = report_1 + report_2
+    assert combined.clinvar_total == 0
+    assert combined.evidence_string_count == 55
+    assert combined.total_trait_mappings == 100
+    assert combined.used_trait_mappings == {'one', 'two', 'three', 'four'}
+    assert combined.unmapped_trait_names == Counter(
+        {'a': 9, 'b': 2, 'r': 2, 'c': 1, 'd': 1, 'l': 1, 'k': 1, 'z': 1, 'm': 1}
+    )
+
+
+def test_check_counts():
+    report = get_test_report()
+    assert report.check_counts()
+    report.clinvar_total = 5
+    assert not report.check_counts()
diff --git a/tests/pipelines/test_annotation_pipeline.sh b/tests/pipelines/test_annotation_pipeline.sh
index af93a7e7..3cded4df 100644
--- a/tests/pipelines/test_annotation_pipeline.sh
+++ b/tests/pipelines/test_annotation_pipeline.sh
@@ -22,7 +22,8 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
 diff ${BATCH_ROOT}/gene_mapping/consequences_snp.tsv ${BATCH_ROOT_BASE}/expected/consequences_snp.tsv
 diff ${BATCH_ROOT}/gene_mapping/consequences_repeat.tsv ${BATCH_ROOT_BASE}/expected/consequences_repeat.tsv
 diff ${BATCH_ROOT}/gene_mapping/consequences_structural.tsv ${BATCH_ROOT_BASE}/expected/consequences_structural.tsv
-diff ${BATCH_ROOT}/evidence_strings/evidence_strings.json ${BATCH_ROOT_BASE}/expected/evidence_strings.json
+# Nextflow parallelism means ordering might not be stable
+diff <(sort ${BATCH_ROOT}/evidence_strings/evidence_strings.json) <(sort ${BATCH_ROOT_BASE}/expected/evidence_strings.json)
 
 cd ${CWD}
 rm -r ${BATCH_ROOT}
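
Note: the start/end indices computed in annotation_pipeline.nf above partition the N RCV records counted by count_clinvar_rcvs.py into numChunks half-open ranges [start, end), which evidence_string_generation.py consumes via --start/--end. A minimal Python sketch of the same index computation, for illustration only (the chunk_indices helper is hypothetical and not part of this patch):

import math

def chunk_indices(n, num_chunks=10):
    # Mirrors the Groovy map in the workflow: step = ceil(n / numChunks),
    # starts are 0, step, 2*step, ...; each end is the next start, the last end is n.
    step = math.ceil(n / num_chunks)
    starts = list(range(0, n, step))
    ends = starts[1:] + [n]
    return list(zip(starts, ends))

# For example, 25 records split into chunks of at most 3 records each:
# [(0, 3), (3, 6), ..., (21, 24), (24, 25)]
print(chunk_indices(25))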