From 32b8d54adf904d2d780c652477014b88f99b8aea Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 29 Feb 2024 15:32:11 +0000 Subject: [PATCH 1/5] chunk and parallelise evidence string generation --- bin/count_clinvar_rcvs.py | 13 ++++ bin/evidence_string_generation.py | 4 +- .../clinvar_to_evidence_strings.py | 15 +++- pipelines/annotation_pipeline.nf | 74 ++++++++++++++++--- .../test_clinvar_to_evidence_strings.py | 1 - tests/pipelines/test_annotation_pipeline.sh | 3 +- 6 files changed, 92 insertions(+), 18 deletions(-) create mode 100644 bin/count_clinvar_rcvs.py diff --git a/bin/count_clinvar_rcvs.py b/bin/count_clinvar_rcvs.py new file mode 100644 index 00000000..13b34ae5 --- /dev/null +++ b/bin/count_clinvar_rcvs.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +import argparse + +from cmat.clinvar_xml_io import ClinVarDataset + +parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout') +parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True) + + +if __name__ == '__main__': + args = parser.parse_args() + print(sum(1 for _ in ClinVarDataset(args.clinvar_xml))) diff --git a/bin/evidence_string_generation.py b/bin/evidence_string_generation.py index 6b6aabf4..28893942 100755 --- a/bin/evidence_string_generation.py +++ b/bin/evidence_string_generation.py @@ -9,10 +9,12 @@ parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True) parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True) parser.add_argument('--out', help='Output directory', required=True) +parser.add_argument('--start', help='Start index (inclusive)', required=False, type=int) +parser.add_argument('--end', help='End index (exclusive)', required=False, type=int) if __name__ == '__main__': args = parser.parse_args() clinvar_to_evidence_strings.launch_pipeline( clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping, - ot_schema_file=args.ot_schema, dir_out=args.out) + ot_schema_file=args.ot_schema, dir_out=args.out, start=args.start, end=args.end) diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 74ffd48f..06ea3e0c 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -60,6 +60,7 @@ def __init__(self, trait_mappings, consequence_mappings): def print_report_and_check_counts(self): """Print report of counts and return True if counts are consistent, False otherwise.""" + # TODO dump to a file so counts can be aggregated # ClinVar tallies. 
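The --start/--end options added above give each worker a half-open window [start, end) of RCV records. A minimal sketch of the same windowing pattern over an arbitrary iterable, using itertools.islice purely for illustration (the pipeline itself keeps an explicit record counter, as shown in the hunk below):

from itertools import islice

def window(records, start=None, end=None):
    # Yields only the records whose index falls in [start, end),
    # without materialising the whole dataset in memory.
    return islice(records, start, end)

# A plain list standing in for ClinVarDataset; values are made up.
records = [f'RCV{i:09d}' for i in range(10)]
print(list(window(records, 3, 6)))  # ['RCV000000003', 'RCV000000004', 'RCV000000005']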
clinvar_fatal = self.clinvar_fatal_no_valid_traits clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + @@ -122,14 +123,14 @@ def validate_evidence_string(ev_string, ot_schema_contents): sys.exit(1) -def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out): +def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, start, end): os.makedirs(dir_out, exist_ok=True) string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file) variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file) report, exception_raised = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, - output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME)) + output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end) counts_consistent = report.print_report_and_check_counts() report.write_unmapped_terms(dir_out) if exception_raised or not counts_consistent: @@ -137,14 +138,22 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema, - output_evidence_strings): + output_evidence_strings, start=None, end=None): report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings) ot_schema_contents = json.loads(open(ot_schema).read()) output_evidence_strings_file = open(output_evidence_strings, 'wt') exception_raised = False logger.info('Processing ClinVar records') + i = -1 for clinvar_record in ClinVarDataset(clinvar_xml): + # If start & end provided, only process records in the range [start, end) + i += 1 + if start and i < start: + continue + if end and i >= end: + break + report.clinvar_total += 1 if report.clinvar_total % 1000 == 0: logger.info(f'{report.clinvar_total} records processed') diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index c7b5411b..e65519d5 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -34,6 +34,8 @@ if (!params.output_dir) { batchRoot = params.output_dir codeRoot = "${projectDir}/.." includeTranscriptsFlag = params.include_transcripts ? "--include-transcripts" : "" +// Number of chunks to run in parallel for evidence generation +numChunks = 10 /* * Main workflow. @@ -56,11 +58,23 @@ workflow { if (params.schema != null) { // Open Targets evidence string output downloadJsonSchema() + // Get start/end indices to break XML into chunks + countClinvarRecords(clinvarXml) + .map { strN -> + n = Integer.parseInt(strN.trim()) + step = (Integer)Math.ceil(n / numChunks) + startIndices = (0..n-1).step(step) + endIndices = startIndices[1..-1] + [n] + [startIndices, endIndices].transpose() + } + .set { startEndPairs } + // Generate evidence for each chunk and concatenate generateEvidence(clinvarXml, downloadJsonSchema.out.jsonSchema, - combineConsequences.out.consequencesCombined) - - checkDuplicates(generateEvidence.out.evidenceStrings) + combineConsequences.out.consequencesCombined, + startEndPairs.collect()) + collectEvidenceStrings(generateEvidence.out.evidenceStrings.collect()) + checkDuplicates(collectEvidenceStrings.out.evidenceStrings) convertXrefs(clinvarXml) } else { @@ -308,25 +322,37 @@ process generateAnnotatedXml { """ } +/* + * Count number of RCV records in ClinVar. 
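The map above converts the RCV count into (start, end) index pairs: the chunk size is ceil(n / numChunks), start indices run 0, step, 2*step, ..., and each end index is the next start (or n for the final chunk). A Python sketch of the same partitioning, with chunk_ranges as an illustrative name:

import math

def chunk_ranges(n, num_chunks):
    # Same arithmetic as the Nextflow closure: ceil-sized windows,
    # each end index equal to the next start index (or n at the tail).
    step = math.ceil(n / num_chunks)
    starts = list(range(0, n, step))
    ends = starts[1:] + [n]
    return list(zip(starts, ends))

print(chunk_ranges(25, 10))
# [(0, 3), (3, 6), (6, 9), (9, 12), (12, 15), (15, 18), (18, 21), (21, 24), (24, 25)]

Note that when the ceil-sized step does not divide n evenly, fewer than numChunks ranges may be produced, but every record index is still covered exactly once.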
+ */ +process countClinvarRecords { + input: + path clinvarXml + + output: + stdout emit: count + + script: + """ + \${PYTHON_BIN} ${codeRoot}/bin/count_clinvar_rcvs.py --clinvar-xml ${clinvarXml} + """ +} + /* * Generate the evidence strings for submission to Open Targets. */ process generateEvidence { - clusterOptions "-o ${batchRoot}/logs/evidence_string_generation.out \ - -e ${batchRoot}/logs/evidence_string_generation.err" - - publishDir "${batchRoot}/evidence_strings", - overwrite: true, - mode: "copy", - pattern: "*.json" + clusterOptions "-o ${batchRoot}/logs/evidence_string_generation_${startEnd[0]}.out \ + -e ${batchRoot}/logs/evidence_string_generation_${startEnd[0]}.err" input: path clinvarXml path jsonSchema path consequenceMappings + each startEnd output: - path "evidence_strings.json", emit: evidenceStrings + path "evidence_strings_*.json", emit: evidenceStrings script: """ @@ -335,7 +361,31 @@ process generateEvidence { --efo-mapping ${params.mappings} \ --gene-mapping ${consequenceMappings} \ --ot-schema ${jsonSchema} \ - --out . + --out . \ + --start ${startEnd[0]} \ + --end ${startEnd[1]} + mv evidence_strings.json evidence_strings_${startEnd[0]}.json + """ +} + +/* + * Concatenate evidence strings into a single file. + */ +process collectEvidenceStrings { + publishDir "${batchRoot}/evidence_strings", + overwrite: true, + mode: "copy", + pattern: "*.json" + + input: + path "evidence_strings_*.json" + + output: + path "evidence_strings.json", emit: evidenceStrings + + script: + """ + cat evidence_strings_*.json >> evidence_strings.json """ } diff --git a/tests/output_generation/test_clinvar_to_evidence_strings.py b/tests/output_generation/test_clinvar_to_evidence_strings.py index d99bf7e2..940f3738 100644 --- a/tests/output_generation/test_clinvar_to_evidence_strings.py +++ b/tests/output_generation/test_clinvar_to_evidence_strings.py @@ -1,6 +1,5 @@ import json import os -import re import requests import xml.etree.ElementTree as ElementTree diff --git a/tests/pipelines/test_annotation_pipeline.sh b/tests/pipelines/test_annotation_pipeline.sh index af93a7e7..3cded4df 100644 --- a/tests/pipelines/test_annotation_pipeline.sh +++ b/tests/pipelines/test_annotation_pipeline.sh @@ -22,7 +22,8 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ diff ${BATCH_ROOT}/gene_mapping/consequences_snp.tsv ${BATCH_ROOT_BASE}/expected/consequences_snp.tsv diff ${BATCH_ROOT}/gene_mapping/consequences_repeat.tsv ${BATCH_ROOT_BASE}/expected/consequences_repeat.tsv diff ${BATCH_ROOT}/gene_mapping/consequences_structural.tsv ${BATCH_ROOT_BASE}/expected/consequences_structural.tsv -diff ${BATCH_ROOT}/evidence_strings/evidence_strings.json ${BATCH_ROOT_BASE}/expected/evidence_strings.json +# Nextflow parallelism means ordering might not be stable +diff <(sort ${BATCH_ROOT}/evidence_strings/evidence_strings.json) <(sort ${BATCH_ROOT_BASE}/expected/evidence_strings.json) cd ${CWD} rm -r ${BATCH_ROOT} From 089ad47048b10682d23f4cd9d2caa10bed8219ff Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 4 Mar 2024 13:21:45 +0000 Subject: [PATCH 2/5] output and aggregate counts for evidence strings --- bin/aggregate_counts.py | 29 ++++ bin/count_clinvar_rcvs.py | 2 +- .../clinvar_to_evidence_strings.py | 95 +------------ cmat/output_generation/report.py | 133 ++++++++++++++++++ pipelines/annotation_pipeline.nf | 24 ++++ 5 files changed, 193 insertions(+), 90 deletions(-) create mode 100644 bin/aggregate_counts.py create mode 100644 cmat/output_generation/report.py diff --git 
a/bin/aggregate_counts.py b/bin/aggregate_counts.py new file mode 100644 index 00000000..894dfa10 --- /dev/null +++ b/bin/aggregate_counts.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +import argparse +import glob + +from cmat.output_generation.report import Report + +parser = argparse.ArgumentParser('Aggregate counts reports') +parser.add_argument('--counts-yml', nargs='+', help='YAML files containing intermediate counts', required=True) + + +if __name__ == '__main__': + args = parser.parse_args() + + # Load all the reports + filenames = [f for files in args.counts_yml for f in glob.glob(files)] + reports = [] + for filename in filenames: + r = Report() + r.load_from_file(filename) + reports.append(r) + + # Sum them up and output the results + complete_report = sum(reports, start=Report()) + complete_report.print_report() + complete_report.dump_to_file(dir_out='.') + complete_report.write_unmapped_terms(dir_out='.') + if not complete_report.check_counts(): + raise RuntimeError('Aggregate counts not consistent') diff --git a/bin/count_clinvar_rcvs.py b/bin/count_clinvar_rcvs.py index 13b34ae5..b11a8679 100644 --- a/bin/count_clinvar_rcvs.py +++ b/bin/count_clinvar_rcvs.py @@ -5,7 +5,7 @@ from cmat.clinvar_xml_io import ClinVarDataset parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout') -parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True) +parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True) if __name__ == '__main__': diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 06ea3e0c..5d441702 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -4,12 +4,13 @@ import re import sys import os -from collections import defaultdict, Counter +from collections import defaultdict import jsonschema from cmat.clinvar_xml_io import ClinVarDataset from cmat.output_generation import consequence_type as CT +from cmat.output_generation.report import Report logger = logging.getLogger(__package__) @@ -22,91 +23,6 @@ # Output file names. EVIDENCE_STRINGS_FILE_NAME = 'evidence_strings.json' EVIDENCE_RECORDS_FILE_NAME = 'evidence_records.tsv' -UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv' - - -class Report: - """Holds counters and other records for a pipeline run.""" - - def __init__(self, trait_mappings, consequence_mappings): - self.report_strings = [] - - # The main evidence string counter. - self.evidence_string_count = 0 - # Complete evidence strings are ones with an EFO mapping. - self.complete_evidence_string_count = 0 - - # ClinVar record counters. - self.clinvar_total = 0 - self.clinvar_fatal_no_valid_traits = 0 - self.clinvar_skip_unsupported_variation = 0 - self.clinvar_skip_no_functional_consequences = 0 - self.clinvar_skip_missing_efo_mapping = 0 - self.clinvar_skip_invalid_evidence_string = 0 - self.clinvar_done_one_complete_evidence_string = 0 - self.clinvar_done_multiple_complete_evidence_strings = 0 - - # Total number of trait-to-ontology mappings present in the database. - self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()]) - # All distinct (trait name, EFO ID) mappings used in the evidence strings. - self.used_trait_mappings = set() - # All unmapped trait names which prevented evidence string generation and their counts. - self.unmapped_trait_names = Counter() - - # Variant-to-consequence mapping counts. 
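aggregate_counts.py above expands each --counts-yml argument with glob and folds the per-chunk reports together via sum(reports, start=Report()), which relies on the __add__ method defined on the new Report class later in this patch. The same folding pattern on a toy class (Tally and its fields are illustrative, not part of the codebase):

class Tally:
    def __init__(self, done=0, skipped=0):
        self.done = done
        self.skipped = skipped

    def __add__(self, other):
        # sum() applies __add__ pairwise, beginning from the `start` value.
        return Tally(self.done + other.done, self.skipped + other.skipped)

chunks = [Tally(10, 1), Tally(8, 0), Tally(12, 3)]
total = sum(chunks, start=Tally())
print(total.done, total.skipped)  # 30 4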
- self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()]) - self.repeat_expansion_variants = 0 - self.structural_variants = 0 - - def print_report_and_check_counts(self): - """Print report of counts and return True if counts are consistent, False otherwise.""" - # TODO dump to a file so counts can be aggregated - # ClinVar tallies. - clinvar_fatal = self.clinvar_fatal_no_valid_traits - clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + - self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) - clinvar_done = (self.clinvar_done_one_complete_evidence_string + - self.clinvar_done_multiple_complete_evidence_strings) - - report = f'''Total number of evidence strings generated\t{self.evidence_string_count} - Total number of complete evidence strings generated\t{self.complete_evidence_string_count} - - Total number of ClinVar records\t{self.clinvar_total} - Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits} - Skipped: Can be rescued by future improvements\t{clinvar_skipped} - Unsupported variation type\t{self.clinvar_skip_unsupported_variation} - No functional consequences\t{self.clinvar_skip_no_functional_consequences} - Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping} - Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string} - Done: Generated at least one complete evidence string\t{clinvar_done} - One complete evidence string\t{self.clinvar_done_one_complete_evidence_string} - Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings} - Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{ - clinvar_done / (clinvar_skipped + clinvar_done):.1%} - - Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings} - The number of distinct trait-to-ontology mappings used in the evidence strings\t{ - len(self.used_trait_mappings)} - The number of distinct unmapped trait names which prevented complete evidence string generation\t{ - len(self.unmapped_trait_names)} - - Total number of variant to consequence mappings\t{self.total_consequence_mappings} - Number of repeat expansion variants\t{self.repeat_expansion_variants} - Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n') - print(report) - - # Confirm counts as expected, exit with error if not. 
- expected_total = clinvar_fatal + clinvar_skipped + clinvar_done - if expected_total != self.clinvar_total: - logger.error(f'ClinVar evidence string tallies do not add up to the total amount: ' - f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}') - return False - return True - - def write_unmapped_terms(self, dir_out): - with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file: - for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]): - unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n') def validate_evidence_string(ev_string, ot_schema_contents): @@ -131,8 +47,9 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc report, exception_raised = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end) - counts_consistent = report.print_report_and_check_counts() - report.write_unmapped_terms(dir_out) + counts_consistent = report.check_counts() + report.print_report() + report.dump_to_file(dir_out) if exception_raised or not counts_consistent: sys.exit(1) @@ -217,7 +134,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings evidence_strings_generated += 1 if disease_mapped_efo_id is not None: complete_evidence_strings_generated += 1 - report.used_trait_mappings.add((disease_name, disease_mapped_efo_id)) + report.used_trait_mappings.add(disease_mapped_efo_id) if complete_evidence_strings_generated == 1: report.clinvar_done_one_complete_evidence_string += 1 diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py new file mode 100644 index 00000000..a5b0093b --- /dev/null +++ b/cmat/output_generation/report.py @@ -0,0 +1,133 @@ +import logging +import os +from collections import Counter + +import yaml + +logger = logging.getLogger(__package__) + +# Add representers to enable safe-dump +yaml.SafeDumper.add_representer(Counter, + lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items())) + +UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv' +COUNTS_FILE_NAME = 'counts.yml' + + +class Report: + """Holds counters and other records for a pipeline run.""" + + def __init__(self, trait_mappings=None, consequence_mappings=None): + # The main evidence string counter. + self.evidence_string_count = 0 + # Complete evidence strings are ones with an EFO mapping. + self.complete_evidence_string_count = 0 + + # ClinVar record counters. + self.clinvar_total = 0 + self.clinvar_fatal_no_valid_traits = 0 + self.clinvar_skip_unsupported_variation = 0 + self.clinvar_skip_no_functional_consequences = 0 + self.clinvar_skip_missing_efo_mapping = 0 + self.clinvar_skip_invalid_evidence_string = 0 + self.clinvar_done_one_complete_evidence_string = 0 + self.clinvar_done_multiple_complete_evidence_strings = 0 + self.clinvar_fatal = 0 + self.clinvar_skipped = 0 + self.clinvar_done = 0 + + # Total number of trait-to-ontology mappings present in the database. + self.total_trait_mappings = 0 + if trait_mappings: + self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()]) + # All distinct (trait name, EFO ID) mappings used in the evidence strings. + self.used_trait_mappings = set() + # All unmapped trait names which prevented evidence string generation and their counts. 
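The representer registered on yaml.SafeDumper near the top of the new report.py is what lets safe_dump serialise a Counter as an ordinary YAML mapping; on safe_load it comes back as a plain dict and load_from_file rewraps it in a Counter. A standalone round-trip sketch of that idea (the trait names are made up):

from collections import Counter

import yaml

yaml.SafeDumper.add_representer(
    Counter,
    lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items()))

counts = Counter(['trait a', 'trait a', 'trait b'])
text = yaml.safe_dump({'unmapped_trait_names': counts})
loaded = yaml.safe_load(text)                       # plain dict after loading
restored = Counter(loaded['unmapped_trait_names'])  # rewrap, as load_from_file does
assert restored == counts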
+ self.unmapped_trait_names = Counter() + + # Variant-to-consequence mapping counts. + self.total_consequence_mappings = 0 + if consequence_mappings: + self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()]) + self.repeat_expansion_variants = 0 + self.structural_variants = 0 + + def __add__(self, other): + if not isinstance(other, Report): + raise ValueError('Both operands must be Report objects') + result = Report() + for var_name in vars(self).keys() | vars(other).keys(): + if var_name == 'total_trait_mappings': + result.total_trait_mappings = max(self.total_trait_mappings, other.total_trait_mappings) + if var_name == 'total_consequence_mappings': + result.total_consequence_mappings = max(self.total_consequence_mappings, + other.total_consequence_mappings) + if var_name == 'used_trait_mappings': + result.used_trait_mappings = self.used_trait_mappings | other.used_trait_mappings + else: + result.__setattr__(var_name, self.__getattribute__(var_name) + other.__getattribute__(var_name)) + return result + + def dump_to_file(self, dir_out, filename=COUNTS_FILE_NAME): + with open(os.path.join(dir_out, filename), 'w') as f: + yaml.safe_dump(vars(self), f) + + def load_from_file(self, filename): + with open(filename, 'r') as f: + data = yaml.safe_load(f) + self.__dict__.update(**data) + # yaml loads a dict, convert to counter + self.unmapped_trait_names = Counter(self.unmapped_trait_names) + + def compute_record_tallies(self): + """Compute tallies of records fatal/skipped/done based on the more granular counts.""" + self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) + self.clinvar_done = (self.clinvar_done_one_complete_evidence_string + + self.clinvar_done_multiple_complete_evidence_strings) + + def check_counts(self): + """Return True if counts are consistent, False otherwise.""" + self.compute_record_tallies() + expected_total = self.clinvar_fatal + self.clinvar_skipped + self.clinvar_done + if expected_total != self.clinvar_total: + logger.error(f'ClinVar evidence string tallies do not add up to the total amount: ' + f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}') + return False + return True + + def print_report(self): + """Print report of counts.""" + self.compute_record_tallies() + report = f'''Total number of evidence strings generated\t{self.evidence_string_count} + Total number of complete evidence strings generated\t{self.complete_evidence_string_count} + + Total number of ClinVar records\t{self.clinvar_total} + Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits} + Skipped: Can be rescued by future improvements\t{self.clinvar_skipped} + Unsupported variation type\t{self.clinvar_skip_unsupported_variation} + No functional consequences\t{self.clinvar_skip_no_functional_consequences} + Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping} + Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string} + Done: Generated at least one complete evidence string\t{self.clinvar_done} + One complete evidence string\t{self.clinvar_done_one_complete_evidence_string} + Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings} + Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{ + self.clinvar_done / 
(self.clinvar_skipped + self.clinvar_done):.1%} + + Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings} + The number of distinct trait-to-ontology mappings used in the evidence strings\t{ + len(self.used_trait_mappings)} + The number of distinct unmapped trait names which prevented complete evidence string generation\t{ + len(self.unmapped_trait_names)} + + Total number of variant to consequence mappings\t{self.total_consequence_mappings} + Number of repeat expansion variants\t{self.repeat_expansion_variants} + Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n') + print(report) + + def write_unmapped_terms(self, dir_out): + with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file: + for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]): + unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n') diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index e65519d5..faab7442 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -74,6 +74,7 @@ workflow { combineConsequences.out.consequencesCombined, startEndPairs.collect()) collectEvidenceStrings(generateEvidence.out.evidenceStrings.collect()) + collectCounts(generateEvidence.out.countsYml.collect()) checkDuplicates(collectEvidenceStrings.out.evidenceStrings) convertXrefs(clinvarXml) @@ -353,6 +354,7 @@ process generateEvidence { output: path "evidence_strings_*.json", emit: evidenceStrings + path "counts_*.yml", emit: countsYml script: """ @@ -365,6 +367,7 @@ process generateEvidence { --start ${startEnd[0]} \ --end ${startEnd[1]} mv evidence_strings.json evidence_strings_${startEnd[0]}.json + mv counts.yml counts_${startEnd[0]}.yml """ } @@ -389,6 +392,27 @@ process collectEvidenceStrings { """ } +/* + * Aggregate counts into a single file and print the report. + */ +process collectCounts { + publishDir "${batchRoot}/logs", + overwrite: true, + mode: "copy", + pattern: "*.yml" + + input: + path "counts_*.yml" + + output: + path "counts.yml", emit: countsYml + + script: + """ + \${PYTHON_BIN} ${codeRoot}/bin/aggregate_counts.py --counts-yml counts_*.yml + """ +} + /* * Check that the generated evidence strings do not contain any duplicated evidence strings. 
*/ From 6d6e45005d687210b2cf8858d53b9899afd53eff Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 4 Mar 2024 14:52:16 +0000 Subject: [PATCH 3/5] add tests and cleanup nextflow --- cmat/output_generation/report.py | 7 +++- pipelines/annotation_pipeline.nf | 6 +-- requirements.txt | 1 + tests/output_generation/test_report.py | 54 ++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 tests/output_generation/test_report.py diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index a5b0093b..55d03b50 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -52,9 +52,14 @@ def __init__(self, trait_mappings=None, consequence_mappings=None): self.repeat_expansion_variants = 0 self.structural_variants = 0 + def __eq__(self, other): + if not isinstance(other, Report): + return NotImplemented + return self.__dict__ == other.__dict__ + def __add__(self, other): if not isinstance(other, Report): - raise ValueError('Both operands must be Report objects') + return NotImplemented result = Report() for var_name in vars(self).keys() | vars(other).keys(): if var_name == 'total_trait_mappings': diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index faab7442..28454081 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -353,8 +353,8 @@ process generateEvidence { each startEnd output: - path "evidence_strings_*.json", emit: evidenceStrings - path "counts_*.yml", emit: countsYml + path "evidence_strings.json", emit: evidenceStrings + path "counts.yml", emit: countsYml script: """ @@ -366,8 +366,6 @@ process generateEvidence { --out . \ --start ${startEnd[0]} \ --end ${startEnd[1]} - mv evidence_strings.json evidence_strings_${startEnd[0]}.json - mv counts.yml counts_${startEnd[0]}.yml """ } diff --git a/requirements.txt b/requirements.txt index 99f5a47c..3694df5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ numpy==1.24.3 pandas==1.5.3 pytest==7.2.2 pytest-cov==2.10.0 +pyyaml==6.0.1 requests==2.31.0 requests-mock==1.8.0 retry==0.9.2 diff --git a/tests/output_generation/test_report.py b/tests/output_generation/test_report.py new file mode 100644 index 00000000..661fdb35 --- /dev/null +++ b/tests/output_generation/test_report.py @@ -0,0 +1,54 @@ +import os +from collections import Counter + +from cmat.output_generation.report import Report + + +resources_dir = os.path.join(os.path.dirname(__file__), 'resources') + + +def get_test_report(): + report = Report() + report.evidence_string_count = 42 + report.used_trait_mappings = {'one', 'two', 'three'} + report.unmapped_trait_names = Counter('abracadabra') + return report + + +def test_dump_and_load(): + counts_file = os.path.join(resources_dir, 'counts.yml') + original_report = get_test_report() + original_report.dump_to_file(resources_dir) + + loaded_report = Report() + loaded_report.load_from_file(counts_file) + assert original_report == loaded_report + + if os.path.exists(counts_file): + os.remove(counts_file) + + +def test_add_operator(): + report_1 = get_test_report() + + report_2 = Report() + report_2.evidence_string_count = 13 + report_2.total_trait_mappings = 100 + report_2.used_trait_mappings = {'three', 'four'} + report_2.unmapped_trait_names = Counter('alakazam') + + combined = report_1 + report_2 + assert combined.clinvar_total == 0 + assert combined.evidence_string_count == 55 + assert combined.total_trait_mappings == 100 + assert combined.used_trait_mappings == 
{'one', 'two', 'three', 'four'} + assert combined.unmapped_trait_names == Counter( + {'a': 9, 'b': 2, 'r': 2, 'c': 1, 'd': 1, 'l': 1, 'k': 1, 'z': 1, 'm': 1} + ) + + +def test_check_counts(): + report = get_test_report() + assert report.check_counts() + report.clinvar_total = 5 + assert not report.check_counts() From ddd9eaa9490c8c50edc26ec0ca9be88946cd19ef Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Mar 2024 09:23:29 +0000 Subject: [PATCH 4/5] adjust counts to appropriately skip invalid clinical significance values --- cmat/clinvar_xml_io/clinvar_record.py | 6 +++++ .../clinvar_to_evidence_strings.py | 26 ++++++++++++------- cmat/output_generation/report.py | 4 ++- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index da653db8..aca234e3 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -34,6 +34,8 @@ class ClinVarRecord: # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful. NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'} + # Some records have been flagged by ClinVar and should not be used. + INVALID_CLINICAL_SIGNFICANCES = {'no classifications from unflagged records'} def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure): """Initialise a ClinVar record object from an RCV XML record.""" @@ -143,6 +145,10 @@ def clinical_significance_list(self): See /data-exploration/clinvar-variant-types/README.md for further explanation.""" return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' '))))) + @property + def valid_clinical_significances(self): + return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNFICANCES] + @property def allele_origins(self): return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')} diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 5d441702..b1cb25c2 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -82,20 +82,25 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings if not clinvar_record.traits_with_valid_names: report.clinvar_fatal_no_valid_traits += 1 continue + # Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to + # submissions being flagged. + if not clinvar_record.valid_clinical_significances: + report.clinvar_fatal_no_clinical_significance += 1 + continue - # Failure mode 2 (skip). A ClinVar record contains an unsupported variation type. + # Failure mode 3 (skip). A ClinVar record contains an unsupported variation type. if clinvar_record.measure is None: report.clinvar_skip_unsupported_variation += 1 continue - # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele - # origins, (2) EFO mappings, and (3) genes where the variant has effect. + # Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid + # allele origins, (2) EFO mappings, and (3) genes where the variant has effect. 
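As the comment above describes, one candidate evidence string is produced per combination of allele-origin group, disease mapping and functional consequence; the combinations come from the itertools.product call a few lines further down in this function. A toy illustration with made-up values:

import itertools

allele_origin_groups = [['germline'], ['somatic']]
diseases = [('disease name', 'MONDO:0000001', 'EFO_0000001')]
consequences = ['missense_variant', 'stop_gained']

combinations = list(itertools.product(allele_origin_groups, diseases, consequences))
print(len(combinations))  # 2 * 1 * 2 = 4 candidate evidence strings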
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins) consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings) grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names, string_to_efo_mappings) - # Failure mode 3 (skip). No functional consequences are available. + # Failure mode 4 (skip). No functional consequences are available. if not consequence_types: report.clinvar_skip_no_functional_consequences += 1 continue @@ -106,9 +111,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings if is_structural_variant(clinvar_record.measure): report.structural_variants += len(consequence_types) - # Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable - # EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a - # failure so we can continue to measure mapping coverage. + # Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no + # suitable EFO mappings were found in the database. This will still generate an evidence string, but is + # tracked as a failure so we can continue to measure mapping coverage. if not any(group[-1] for group in grouped_diseases): report.clinvar_skip_missing_efo_mapping += 1 unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name @@ -122,8 +127,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings for allele_origins, disease_attributes, consequence_attributes in itertools.product( grouped_allele_origins, grouped_diseases, consequence_types): disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes - evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id, - disease_mapped_efo_id, consequence_attributes) + evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, + disease_source_id, disease_mapped_efo_id, + consequence_attributes) # Validate and immediately output the evidence string (not keeping everything in memory). is_valid = validate_evidence_string(evidence_string, ot_schema_contents) @@ -185,7 +191,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea 'allelicRequirements': clinvar_record.mode_of_inheritance, # Levels of clinical significance reported for the variant. - 'clinicalSignificances': clinvar_record.clinical_significance_list, + 'clinicalSignificances': clinvar_record.valid_clinical_significances, # Confidence (review status). 'confidence': clinvar_record.review_status, diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index 55d03b50..fcd37742 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -26,6 +26,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None): # ClinVar record counters. 
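The valid_clinical_significances property introduced above filters the parsed significance terms against INVALID_CLINICAL_SIGNFICANCES, so records whose only term is the flagged-record placeholder are counted as fatal and the placeholder never appears in clinicalSignificances. A small standalone sketch of that parse-then-filter step (the raw value is contrived for illustration):

import re

INVALID = {'no classifications from unflagged records'}

raw = 'Pathogenic/Likely_pathogenic; no_classifications_from_unflagged_records'
terms = sorted(set(re.split('/|, |; ', raw.lower().replace('_', ' '))))
valid = [t for t in terms if t not in INVALID]
print(terms)  # ['likely pathogenic', 'no classifications from unflagged records', 'pathogenic']
print(valid)  # ['likely pathogenic', 'pathogenic']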
self.clinvar_total = 0 self.clinvar_fatal_no_valid_traits = 0 + self.clinvar_fatal_no_clinical_significance = 0 self.clinvar_skip_unsupported_variation = 0 self.clinvar_skip_no_functional_consequences = 0 self.clinvar_skip_missing_efo_mapping = 0 @@ -86,7 +87,7 @@ def load_from_file(self, filename): def compute_record_tallies(self): """Compute tallies of records fatal/skipped/done based on the more granular counts.""" - self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) self.clinvar_done = (self.clinvar_done_one_complete_evidence_string + @@ -110,6 +111,7 @@ def print_report(self): Total number of ClinVar records\t{self.clinvar_total} Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits} + No clinical significance\t{self.clinvar_fatal_no_clinical_significance} Skipped: Can be rescued by future improvements\t{self.clinvar_skipped} Unsupported variation type\t{self.clinvar_skip_unsupported_variation} No functional consequences\t{self.clinvar_skip_no_functional_consequences} From 0668213c85e09514755bdd1ff2f64bced1418c37 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Mar 2024 16:23:08 +0000 Subject: [PATCH 5/5] bugfix for summing reports --- cmat/output_generation/report.py | 4 ++-- tests/output_generation/test_report.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmat/output_generation/report.py b/cmat/output_generation/report.py index fcd37742..f7b404c7 100644 --- a/cmat/output_generation/report.py +++ b/cmat/output_generation/report.py @@ -65,10 +65,10 @@ def __add__(self, other): for var_name in vars(self).keys() | vars(other).keys(): if var_name == 'total_trait_mappings': result.total_trait_mappings = max(self.total_trait_mappings, other.total_trait_mappings) - if var_name == 'total_consequence_mappings': + elif var_name == 'total_consequence_mappings': result.total_consequence_mappings = max(self.total_consequence_mappings, other.total_consequence_mappings) - if var_name == 'used_trait_mappings': + elif var_name == 'used_trait_mappings': result.used_trait_mappings = self.used_trait_mappings | other.used_trait_mappings else: result.__setattr__(var_name, self.__getattribute__(var_name) + other.__getattribute__(var_name)) diff --git a/tests/output_generation/test_report.py b/tests/output_generation/test_report.py index 661fdb35..f356ecce 100644 --- a/tests/output_generation/test_report.py +++ b/tests/output_generation/test_report.py @@ -30,6 +30,7 @@ def test_dump_and_load(): def test_add_operator(): report_1 = get_test_report() + report_1.total_trait_mappings = 100 report_2 = Report() report_2.evidence_string_count = 13
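Patch 5 turns the second and third special cases in Report.__add__ into elif branches. With three independent if statements the trailing else pairs only with the last one, so after total_trait_mappings (or total_consequence_mappings) is combined with max() the else branch still runs and overwrites the result with a plain sum; the test's new report_1.total_trait_mappings = 100 line makes the existing assertion catch this. A toy reproduction of the fall-through (function and field names are illustrative):

def combine_buggy(name, a, b):
    if name == 'total_trait_mappings':
        result = max(a, b)   # special case runs...
    if name == 'used_trait_mappings':
        result = a | b
    else:
        result = a + b       # ...but the else paired with the second if runs too
    return result

def combine_fixed(name, a, b):
    if name == 'total_trait_mappings':
        result = max(a, b)
    elif name == 'used_trait_mappings':
        result = a | b
    else:
        result = a + b
    return result

print(combine_buggy('total_trait_mappings', 100, 100))  # 200 -- summed by mistake
print(combine_fixed('total_trait_mappings', 100, 100))  # 100 -- max, as intended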