EBIvariation · apriltuesday · Mar 7, 2024 · Feb 29, 2024 · Mar 4, 2024 · Mar 4, 2024
diff --git a/bin/aggregate_counts.py b/bin/aggregate_counts.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import argparse
+import glob
+
+from cmat.output_generation.report import Report
+
+parser = argparse.ArgumentParser('Aggregate counts reports')
+parser.add_argument('--counts-yml', nargs='+', help='YAML files containing intermediate counts', required=True)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+
+    # Load all the reports
+    filenames = [f for files in args.counts_yml for f in glob.glob(files)]
+    reports = []
+    for filename in filenames:
+        r = Report()
+        r.load_from_file(filename)
+        reports.append(r)
+
+    # Sum them up and output the results
+    complete_report = sum(reports, start=Report())
+    complete_report.print_report()
+    complete_report.dump_to_file(dir_out='.')
+    complete_report.write_unmapped_terms(dir_out='.')
+    if not complete_report.check_counts():
+        raise RuntimeError('Aggregate counts not consistent')
diff --git a/bin/count_clinvar_rcvs.py b/bin/count_clinvar_rcvs.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import argparse
+
+from cmat.clinvar_xml_io import ClinVarDataset
+
+parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout')
+parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print(sum(1 for _ in ClinVarDataset(args.clinvar_xml)))
diff --git a/bin/evidence_string_generation.py b/bin/evidence_string_generation.py
@@ -9,10 +9,12 @@
 parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
 parser.add_argument('--ot-schema',    help='OpenTargets schema JSON',                required=True)
 parser.add_argument('--out',          help='Output directory',                       required=True)
+parser.add_argument('--start',        help='Start index (inclusive)',                required=False, type=int)
+parser.add_argument('--end',          help='End index (exclusive)',                  required=False, type=int)
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
     clinvar_to_evidence_strings.launch_pipeline(
         clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
-        ot_schema_file=args.ot_schema, dir_out=args.out)
+        ot_schema_file=args.ot_schema, dir_out=args.out, start=args.start, end=args.end)
diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py
@@ -4,12 +4,13 @@
 import re
 import sys
 import os
-from collections import defaultdict, Counter
+from collections import defaultdict
 
 import jsonschema
 
 from cmat.clinvar_xml_io import ClinVarDataset
 from cmat.output_generation import consequence_type as CT
+from cmat.output_generation.report import Report
 
 logger = logging.getLogger(__package__)
 
@@ -22,90 +23,6 @@
 # Output file names.
 EVIDENCE_STRINGS_FILE_NAME = 'evidence_strings.json'
 EVIDENCE_RECORDS_FILE_NAME = 'evidence_records.tsv'
-UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'
-
-
-class Report:
-    """Holds counters and other records for a pipeline run."""
-
-    def __init__(self, trait_mappings, consequence_mappings):
-        self.report_strings = []
-
-        # The main evidence string counter.
-        self.evidence_string_count = 0
-        # Complete evidence strings are ones with an EFO mapping.
-        self.complete_evidence_string_count = 0
-
-        # ClinVar record counters.
-        self.clinvar_total = 0
-        self.clinvar_fatal_no_valid_traits = 0
-        self.clinvar_skip_unsupported_variation = 0
-        self.clinvar_skip_no_functional_consequences = 0
-        self.clinvar_skip_missing_efo_mapping = 0
-        self.clinvar_skip_invalid_evidence_string = 0
-        self.clinvar_done_one_complete_evidence_string = 0
-        self.clinvar_done_multiple_complete_evidence_strings = 0
-
-        # Total number of trait-to-ontology mappings present in the database.
-        self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
-        # All distinct (trait name, EFO ID) mappings used in the evidence strings.
-        self.used_trait_mappings = set()
-        # All unmapped trait names which prevented evidence string generation and their counts.
-        self.unmapped_trait_names = Counter()
-
-        # Variant-to-consequence mapping counts.
-        self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
-        self.repeat_expansion_variants = 0
-        self.structural_variants = 0
-
-    def print_report_and_check_counts(self):
-        """Print report of counts and return True if counts are consistent, False otherwise."""
-        # ClinVar tallies.
-        clinvar_fatal = self.clinvar_fatal_no_valid_traits
-        clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
-                           self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
-        clinvar_done = (self.clinvar_done_one_complete_evidence_string +
-                        self.clinvar_done_multiple_complete_evidence_strings)
-
-        report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
-            Total number of complete evidence strings generated\t{self.complete_evidence_string_count}
-
-            Total number of ClinVar records\t{self.clinvar_total}
-                Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
-                Skipped: Can be rescued by future improvements\t{clinvar_skipped}
-                    Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
-                    No functional consequences\t{self.clinvar_skip_no_functional_consequences}
-                    Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
-                    Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
-                Done: Generated at least one complete evidence string\t{clinvar_done}
-                    One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
-                    Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
-            Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
-                clinvar_done / (clinvar_skipped + clinvar_done):.1%}
-
-            Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
-                The number of distinct trait-to-ontology mappings used in the evidence strings\t{
-                    len(self.used_trait_mappings)}
-            The number of distinct unmapped trait names which prevented complete evidence string generation\t{
-                len(self.unmapped_trait_names)}
-
-            Total number of variant to consequence mappings\t{self.total_consequence_mappings}
-                Number of repeat expansion variants\t{self.repeat_expansion_variants}
-                Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
-        print(report)
-
-        # Confirm counts as expected, exit with error if not.
-        expected_total = clinvar_fatal + clinvar_skipped + clinvar_done
-        if expected_total != self.clinvar_total:
-            logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
-                         f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
-            return False
-        return True
-
-    def write_unmapped_terms(self, dir_out):
-        with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
-            for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
-                unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')
 
 
 def validate_evidence_string(ev_string, ot_schema_contents):
@@ -122,29 +39,38 @@ def validate_evidence_string(ev_string, ot_schema_contents):
         sys.exit(1)
 
 
-def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out):
+def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, start, end):
     os.makedirs(dir_out, exist_ok=True)
     string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file)
     variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)
 
     report, exception_raised = clinvar_to_evidence_strings(
         string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
-        output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
-    counts_consistent = report.print_report_and_check_counts()
-    report.write_unmapped_terms(dir_out)
+        output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end)
+    counts_consistent = report.check_counts()
+    report.print_report()
+    report.dump_to_file(dir_out)
     if exception_raised or not counts_consistent:
         sys.exit(1)
 
 
 def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
-                                output_evidence_strings):
+                                output_evidence_strings, start=None, end=None):
     report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
     ot_schema_contents = json.loads(open(ot_schema).read())
     output_evidence_strings_file = open(output_evidence_strings, 'wt')
     exception_raised = False
 
     logger.info('Processing ClinVar records')
+    i = -1
     for clinvar_record in ClinVarDataset(clinvar_xml):
+        # If start & end provided, only process records in the range [start, end)
+        i += 1
+        if start and i < start:
+            continue
+        if end and i >= end:
+            break
+
         report.clinvar_total += 1
         if report.clinvar_total % 1000 == 0:
             logger.info(f'{report.clinvar_total} records processed')
@@ -208,7 +134,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
                     evidence_strings_generated += 1
                     if disease_mapped_efo_id is not None:
                         complete_evidence_strings_generated += 1
-                        report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))
+                        report.used_trait_mappings.add(disease_mapped_efo_id)
 
             if complete_evidence_strings_generated == 1:
                 report.clinvar_done_one_complete_evidence_string += 1

diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py
@@ -0,0 +1,138 @@
+import logging
+import os
+from collections import Counter
+
+import yaml
+
+logger = logging.getLogger(__package__)
+
+# Add representers to enable safe-dump
+yaml.SafeDumper.add_representer(Counter,
+                                lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items()))
+
+UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'
+COUNTS_FILE_NAME = 'counts.yml'
+
+
+class Report:
+    """Holds counters and other records for a pipeline run."""
+
+    def __init__(self, trait_mappings=None, consequence_mappings=None):
+        # The main evidence string counter.
+        self.evidence_string_count = 0
+        # Complete evidence strings are ones with an EFO mapping.
+        self.complete_evidence_string_count = 0
+
+        # ClinVar record counters.
+        self.clinvar_total = 0
+        self.clinvar_fatal_no_valid_traits = 0
+        self.clinvar_skip_unsupported_variation = 0
+        self.clinvar_skip_no_functional_consequences = 0
+        self.clinvar_skip_missing_efo_mapping = 0
+        self.clinvar_skip_invalid_evidence_string = 0
+        self.clinvar_done_one_complete_evidence_string = 0
+        self.clinvar_done_multiple_complete_evidence_strings = 0
+        self.clinvar_fatal = 0
+        self.clinvar_skipped = 0
+        self.clinvar_done = 0
+
+        # Total number of trait-to-ontology mappings present in the database.
+        self.total_trait_mappings = 0
+        if trait_mappings:
+            self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
+        # All distinct (trait name, EFO ID) mappings used in the evidence strings.
+        self.used_trait_mappings = set()
+        # All unmapped trait names which prevented evidence string generation and their counts.
+        self.unmapped_trait_names = Counter()
+
+        # Variant-to-consequence mapping counts.
+        self.total_consequence_mappings = 0
+        if consequence_mappings:
+            self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
+        self.repeat_expansion_variants = 0
+        self.structural_variants = 0
+
+    def __eq__(self, other):
+        if not isinstance(other, Report):
+            return NotImplemented
+        return self.__dict__ == other.__dict__
+
+    def __add__(self, other):
+        if not isinstance(other, Report):
+            return NotImplemented
+        result = Report()
+        for var_name in vars(self).keys() | vars(other).keys():
+            if var_name == 'total_trait_mappings':
+                result.total_trait_mappings = max(self.total_trait_mappings, other.total_trait_mappings)
+            if var_name == 'total_consequence_mappings':
+                result.total_consequence_mappings = max(self.total_consequence_mappings,
+                                                        other.total_consequence_mappings)
+            if var_name == 'used_trait_mappings':
+                result.used_trait_mappings = self.used_trait_mappings | other.used_trait_mappings
+            else:
+                result.__setattr__(var_name, self.__getattribute__(var_name) + other.__getattribute__(var_name))
+        return result
+
+    def dump_to_file(self, dir_out, filename=COUNTS_FILE_NAME):
+        with open(os.path.join(dir_out, filename), 'w') as f:
+            yaml.safe_dump(vars(self), f)
+
+    def load_from_file(self, filename):
+        with open(filename, 'r') as f:
+            data = yaml.safe_load(f)
+            self.__dict__.update(**data)
+        # yaml loads a dict, convert to counter
+        self.unmapped_trait_names = Counter(self.unmapped_trait_names)
+
+    def compute_record_tallies(self):
+        """Compute tallies of records fatal/skipped/done based on the more granular counts."""
+        self.clinvar_fatal = self.clinvar_fatal_no_valid_traits
+        self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
+                                self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
+        self.clinvar_done = (self.clinvar_done_one_complete_evidence_string +
+                             self.clinvar_done_multiple_complete_evidence_strings)
+
+    def check_counts(self):
+        """Return True if counts are consistent, False otherwise."""
+        self.compute_record_tallies()
+        expected_total = self.clinvar_fatal + self.clinvar_skipped + self.clinvar_done
+        if expected_total != self.clinvar_total:
+            logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
+                         f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
+            return False
+        return True
+
+    def print_report(self):
+        """Print report of counts."""
+        self.compute_record_tallies()
+        report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
+            Total number of complete evidence strings generated\t{self.complete_evidence_string_count}
+
+            Total number of ClinVar records\t{self.clinvar_total}
+                Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
+                Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
+                    Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
+                    No functional consequences\t{self.clinvar_skip_no_functional_consequences}
+                    Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
+                    Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
+                Done: Generated at least one complete evidence string\t{self.clinvar_done}
+                    One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
+                    Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
+            Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
+        self.clinvar_done / (self.clinvar_skipped + self.clinvar_done):.1%}
+
+            Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
+                The number of distinct trait-to-ontology mappings used in the evidence strings\t{
+        len(self.used_trait_mappings)}
+            The number of distinct unmapped trait names which prevented complete evidence string generation\t{
+        len(self.unmapped_trait_names)}
+
+            Total number of variant to consequence mappings\t{self.total_consequence_mappings}
+                Number of repeat expansion variants\t{self.repeat_expansion_variants}
+                Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
+        print(report)
+
+    def write_unmapped_terms(self, dir_out):
+        with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
+            for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
+                unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')