Skip to content

Commit

Permalink
Merge pull request #68 from oxfordmmm/feat/gvcf-support
Browse files Browse the repository at this point in the history
Feat/gvcf support
  • Loading branch information
JeremyWesthead authored Jun 19, 2024
2 parents 611dc8b + c78445c commit d06efd7
Show file tree
Hide file tree
Showing 6 changed files with 300 additions and 4 deletions.
8 changes: 8 additions & 0 deletions bin/gnomonicus
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ if __name__ == "__main__":
default=None,
help="Path to a FASTA file to use for null rule adjudication. This returns matches for null-call rules in cases of corresponding `N` values in a FASTA file.",
)
parser.add_argument(
"--min_dp",
type=int,
required=False,
default=None,
help="Minimum depth for a variant to be considered in the VCF. Below this value, rows are interpreted as null calls. Should allow for skipping FASTA adjudication in cases of low depth while adding coverage data.",
)
options = parser.parse_args()

options.output_dir = os.path.realpath(options.output_dir)
Expand Down Expand Up @@ -177,6 +184,7 @@ if __name__ == "__main__":
options.vcf_file,
ignore_filter=options.ignore_vcf_filter,
minor_population_indices=minor_indices,
min_dp=options.min_dp,
)
sample = reference + vcf
logging.debug("Applied the VCF to the reference")
Expand Down
134 changes: 134 additions & 0 deletions bin/merge-vcfs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Merge a minos VCF with a GVCF at certain positions (driven by the catalogue)."""
import argparse
import gumpy

from vcf_subset import subset_vcf

from pathlib import Path

def fetch_minos_positions(minos_vcf: Path) -> set[int]:
    """Given a minos VCF, return the positions to exclude from the gvcf.

    Args:
        minos_vcf (Path): Path to the minos VCF file.

    Returns:
        set[int]: The positions to exclude.
    """
    vcf = gumpy.VCFFile(minos_vcf.as_posix())
    excluded: set[int] = set()
    for position, call_type in vcf.calls:
        if call_type == "indel":
            # Indels need inspecting: a deletion removes a run of bases, so
            # every deleted position is already covered by the minos VCF,
            # whereas an insertion only touches its anchor position
            call, bases = vcf.calls[(position, call_type)]["call"]
            if call == "ins":
                excluded.add(position)
            else:
                excluded.update(range(position, position + len(bases)))
        else:
            # Non-indels need no more checking as they are exactly in the VCF
            excluded.add(position)
    return excluded



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--minos_vcf", help="The minos VCF filepath", required=True)
    parser.add_argument("--gvcf", help="The GVCF filepath", required=True)
    parser.add_argument(
        "--resistant-positions", help="Path to list of resistant sites", required=True
    )
    parser.add_argument("--output", help="The output VCF file path", required=True)
    args = parser.parse_args()

    # Sanity check that args are paths
    minos_path = Path(args.minos_vcf)
    gvcf_path = Path(args.gvcf)
    resistant_positions_path = Path(args.resistant_positions)
    output_path = Path(args.output)

    # Sanity checking arguments
    if (
        not minos_path.exists()
        or not gvcf_path.exists()
        or not resistant_positions_path.exists()
    ):
        raise FileNotFoundError("One or more of the input files does not exist!")
    if output_path.exists():
        raise FileExistsError("Output file already exists!")

    # Read in the resistant positions. Skip blank lines so a trailing
    # newline (or accidental empty line) doesn't raise ValueError on int("")
    with open(resistant_positions_path) as f:
        resistant_positions = {int(line.strip()) for line in f if line.strip()}

    # Only fetch GVCF rows for resistant sites the minos VCF doesn't already cover
    minos_positions = fetch_minos_positions(minos_path)
    to_fetch = sorted(resistant_positions - minos_positions)
    print(f"Fetching {len(to_fetch)} positions from the GVCF")

    # Pull out these positions from the GVCF
    gvcf_headers, subset = subset_vcf(gvcf_path.as_posix(), to_fetch)

    minos_headers, minos_values = subset_vcf(
        minos_path.as_posix(), sorted(minos_positions)
    )

    # Pull out header parts to catch parts which need adding
    minos_format = [header for header in minos_headers if "##FORMAT" in header]
    minos_info = [header for header in minos_headers if "##INFO" in header]
    minos_filter = [header for header in minos_headers if "##FILTER" in header]

    gvcf_format = [header for header in gvcf_headers if "##FORMAT" in header]
    gvcf_info = [header for header in gvcf_headers if "##INFO" in header]
    gvcf_filter = [header for header in gvcf_headers if "##FILTER" in header]

    # GVCF-only header lines which must be merged in for the output to be valid
    missing_format = [header for header in gvcf_format if header not in minos_format]
    missing_info = [header for header in gvcf_info if header not in minos_info]
    missing_filter = [header for header in gvcf_filter if header not in minos_filter]

    # Everything else from the minos header (e.g. ##fileformat, ##contig),
    # excluding the #CHROM column-header line which must come last
    minos_misc_headers = [
        header
        for header in minos_headers
        if "##FORMAT" not in header
        and "##INFO" not in header
        and "##FILTER" not in header
        and "#CHROM" not in header
    ]

    chrom_line = [header for header in minos_headers if "#CHROM" in header][0]

    with open(output_path, "w") as f:
        for misc in minos_misc_headers:
            f.write(misc + "\n")

        # Merged format
        for header in minos_format + missing_format:
            f.write(header + "\n")

        # Merged info
        for header in minos_info + missing_info:
            f.write(header + "\n")

        # Merged filter
        for header in minos_filter + missing_filter:
            f.write(header + "\n")

        f.write(chrom_line + "\n")

        # Minos rows
        for row in minos_values:
            f.write(row + "\n")

        # GVCF rows
        for row in subset:
            fields = row.split("\t")
            # Replace the DP FORMAT key with COV as GVCF doesn't have a
            # COV-like field. Replace token-wise within the FORMAT column
            # (index 8) rather than str.replace on the whole row, which
            # would also corrupt other keys containing "DP" (e.g. DPF)
            # and any matching substring in the sample columns
            if len(fields) > 8:
                fields[8] = ":".join(
                    "COV" if key == "DP" else key for key in fields[8].split(":")
                )
            # GVCF doesn't explicitly call filter passes, so ensure the calls are picked up
            fields[6] = "PASS" if fields[6] == "." else fields[6]
            f.write("\t".join(fields) + "\n")
8 changes: 4 additions & 4 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = gnomonicus
version = 2.5.7
version = 2.6.0
author = Philip W Fowler, Jeremy Westhead
author_email = philip.fowler@ndm.ox.ac.uk
description = Python code to integrate results of tb-pipeline and provide an antibiogram, mutations and variants
Expand All @@ -16,14 +16,14 @@ package_dir =
packages = find:
python_requires = >=3.10
install_requires =
gumpy>=1.2.7
piezo>=0.8.2
gumpy>=1.3.0
piezo>=0.8.3
numpy
pandas
recursive_diff
zip_safe = False
include_package_data = True
scripts = bin/gnomonicus, bin/gbkToPkl, bin/gnomonicus-table-update
scripts = bin/gnomonicus, bin/gbkToPkl, bin/gnomonicus-table-update, bin/merge-vcfs

[options.package_data]
mypkg = config/*.gbk
Expand Down
Binary file modified tests/test-cases/TEST-DNA-no-overlap.gbk.pkl
Binary file not shown.
Binary file modified tests/test-cases/TEST-DNA.gbk.pkl
Binary file not shown.
154 changes: 154 additions & 0 deletions tests/unit/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2913,6 +2913,160 @@ def test_12():
recursive_eq(ordered(expectedJSON), ordered(actualJSON))


def test_13():
    """Force null calls by passing a min_dp
    Input:
        NC_045512.2-S_E484K-minos.vcf
    Expect output:
        variants: 23012g>x
        mutations: E484X
        predictions: {'AAA': 'F'}
    """
    # Setup
    setupOutput("13")
    reference = gnomonicus.loadGenome("tests/test-cases/NC_045512.2.gbk", False)
    catalogue = piezo.ResistanceCatalogue(
        "tests/test-cases/NC_045512.2-test-catalogue.csv", prediction_subset_only=True
    )

    vcf = gumpy.VCFFile(
        "tests/test-cases/NC_045512.2-S_E484K-minos.vcf",
        ignore_filter=True,
        bypass_reference_calls=True,
        # Ridiculously high min_dp to force null calls in pre-existing VCF
        # (the E484K row has DP=44, so min_dp=45 turns it into a null call)
        min_dp=45,
    )
    vcfStem = "NC_045512.2-S_E484K-minos"

    # Apply the VCF to the reference to build the sample genome
    sample = reference + vcf

    # Genome-level differences between reference and sample drive the variants
    diff = reference - sample

    # Populate the tables
    path = "tests/outputs/13/"
    gnomonicus.populateVariants(vcfStem, path, diff, True, False)
    mutations, referenceGenes, errors = gnomonicus.populateMutations(
        vcfStem, path, diff, reference, sample, catalogue, True, False
    )
    e, phenotypes, _ = gnomonicus.populateEffects(
        path, catalogue, mutations, referenceGenes, vcfStem, True, True
    )

    # Check for expected values within csvs
    variants = pd.read_csv(path + f"{vcfStem}.variants.csv")
    mutations = pd.read_csv(path + f"{vcfStem}.mutations.csv")
    effects = pd.read_csv(path + f"{vcfStem}.effects.csv")
    predictions = pd.read_csv(path + f"{vcfStem}.predictions.csv")

    # The low-depth row should surface as a null call (x), not the E484K SNP
    assert variants["variant"][0] == "23012g>x"

    assert mutations["gene"][0] == "S"
    assert mutations["mutation"][0] == "E484X"

    # Null call at a resistance site should give an 'F' (fail) prediction for AAA
    assert "AAA" in effects["drug"].to_list()
    assert effects["prediction"][effects["drug"].to_list().index("AAA")] == "F"

    # Every prediction row should carry the test catalogue's metadata;
    # exactly one AAA row ('F') and one BBB row ('S') are expected
    hits = []
    for _, row in predictions.iterrows():
        assert row["catalogue_name"] == "gnomonicus_test"
        assert row["catalogue_version"] == "v1.0"
        assert row["catalogue_values"] == "RFUS"
        if row["drug"] == "AAA":
            hits.append("AAA")
            assert row["prediction"] == "F"
        elif row["drug"] == "BBB":
            hits.append("BBB")
            assert row["prediction"] == "S"
        else:
            hits.append(None)
    assert sorted(hits) == ["AAA", "BBB"]

    gnomonicus.saveJSON(
        variants,
        mutations,
        e,
        phenotypes,
        path,
        vcfStem,
        catalogue,
        gnomonicus.__version__,
        -1,
        reference,
        "",
        "",
        "",
        {},
    )

    # Full expected JSON output, including the VCF evidence for the null call
    # (note DP is 44 — below the min_dp of 45 passed above)
    expectedJSON = {
        "meta": {
            "workflow_version": gnomonicus.__version__,
            "guid": vcfStem,
            "status": "success",
            "workflow_name": "gnomonicus",
            "workflow_task": "resistance_prediction",
            "reference": "NC_045512",
            "catalogue_type": "RFUS",
            "catalogue_name": "gnomonicus_test",
            "catalogue_version": "v1.0",
        },
        "data": {
            "variants": [
                {
                    "variant": "23012g>x",
                    "nucleotide_index": 23012,
                    "gene_name": "S",
                    "gene_position": 484,
                    "codon_idx": 0,
                    "vcf_evidence": {
                        "GT": [1, 1],
                        "DP": 44,
                        "DPF": 0.991,
                        "COV": [0, 44],
                        "FRS": 1.0,
                        "GT_CONF": 300.34,
                        "GT_CONF_PERCENTILE": 54.73,
                        "REF": "g",
                        "ALTS": ["a"],
                        "POS": 23012,
                    },
                    "vcf_idx": 1,
                }
            ],
            "mutations": [
                {
                    "mutation": "E484X",
                    "gene": "S",
                    "gene_position": 484,
                    "ref": "gaa",
                    "alt": "xaa",
                }
            ],
            "effects": {
                "AAA": [
                    {
                        "gene": "S",
                        "mutation": "E484X",
                        "prediction": "F",
                        "evidence": {"row": 2},
                    },
                    {"phenotype": "F"},
                ],
            },
            "antibiogram": {"AAA": "F", "BBB": "S"},
        },
    }
    # Round-trip through json to normalise key ordering before comparison
    expectedJSON = json.loads(json.dumps(expectedJSON, sort_keys=True))

    actualJSON = prep_json(
        json.load(open(os.path.join(path, f"{vcfStem}.gnomonicus-out.json"), "r"))
    )

    # assert == does work here, but gives ugly errors if mismatch
    # Recursive_eq reports neat places they differ
    recursive_eq(ordered(expectedJSON), ordered(actualJSON))


def compare_effects(effects: pd.DataFrame, expected: [str]) -> None:
"""Compare an effects DataFrame with the expected values
Expand Down

0 comments on commit d06efd7

Please sign in to comment.