Skip to content

Commit

Permalink
Merge pull request #68 from oxfordmmm/feat/gvcf-support
Browse files Browse the repository at this point in the history
Feat/gvcf support
  • Loading branch information
JeremyWesthead authored Jun 19, 2024
2 parents 611dc8b + c78445c commit d06efd7
Show file tree
Hide file tree
Showing 6 changed files with 300 additions and 4 deletions.
8 changes: 8 additions & 0 deletions bin/gnomonicus
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ if __name__ == "__main__":
default=None,
help="Path to a FASTA file to use for null rule adjudication. This returns matches for null-call rules in cases of corresponding `N` values in a FASTA file.",
)
parser.add_argument(
"--min_dp",
type=int,
required=False,
default=None,
help="Minimum depth for a variant to be considered in the VCF. Below this value, rows are interpreted as null calls. Should allow for skipping FASTA adjudication in cases of low depth while adding coverage data.",
)
options = parser.parse_args()

options.output_dir = os.path.realpath(options.output_dir)
Expand Down Expand Up @@ -177,6 +184,7 @@ if __name__ == "__main__":
options.vcf_file,
ignore_filter=options.ignore_vcf_filter,
minor_population_indices=minor_indices,
min_dp=options.min_dp,
)
sample = reference + vcf
logging.debug("Applied the VCF to the reference")
Expand Down
134 changes: 134 additions & 0 deletions bin/merge-vcfs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Merge a minos VCF with a GVCF at certain positions (driven by the catalogue)."""
import argparse
import gumpy

from vcf_subset import subset_vcf

from pathlib import Path

def fetch_minos_positions(minos_vcf: Path) -> set[int]:
    """Given a minos VCF, return the positions to exclude from the gvcf.

    Args:
        minos_vcf (Path): Path to the minos VCF file.

    Returns:
        set[int]: The positions to exclude.
    """
    vcf = gumpy.VCFFile(minos_vcf.as_posix())
    excluded: set[int] = set()
    for position, call_type in vcf.calls:
        if call_type == "indel":
            # Indels need inspecting: a deletion removes a run of bases, so
            # every deleted position is already covered by the minos VCF,
            # whereas an insertion only touches its anchor position
            call, bases = vcf.calls[(position, call_type)]["call"]
            if call == "ins":
                excluded.add(position)
            else:
                excluded.update(range(position, position + len(bases)))
        else:
            # Non-indels need no more checking as they are exactly in the VCF
            excluded.add(position)
    return excluded



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--minos_vcf", help="The minos VCF filepath", required=True)
    parser.add_argument("--gvcf", help="The GVCF filepath", required=True)
    parser.add_argument(
        "--resistant-positions", help="Path to list of resistant sites", required=True
    )
    parser.add_argument("--output", help="The output VCF file path", required=True)
    args = parser.parse_args()

    # Sanity check that args are paths
    minos_path = Path(args.minos_vcf)
    gvcf_path = Path(args.gvcf)
    resistant_positions_path = Path(args.resistant_positions)
    output_path = Path(args.output)

    # Sanity checking arguments
    if (
        not minos_path.exists()
        or not gvcf_path.exists()
        or not resistant_positions_path.exists()
    ):
        raise FileNotFoundError("One or more of the input files does not exist!")
    if output_path.exists():
        raise FileExistsError("Output file already exists!")

    # Read in the resistant positions. Skip blank lines so a trailing
    # newline (or accidental empty line) doesn't raise ValueError on int("")
    with open(resistant_positions_path) as f:
        resistant_positions = {int(line.strip()) for line in f if line.strip()}

    # Only fetch GVCF rows for resistant sites the minos VCF doesn't already cover
    minos_positions = fetch_minos_positions(minos_path)
    to_fetch = sorted(resistant_positions - minos_positions)
    print(f"Fetching {len(to_fetch)} positions from the GVCF")

    # Pull out these positions from the GVCF
    gvcf_headers, subset = subset_vcf(gvcf_path.as_posix(), to_fetch)

    minos_headers, minos_values = subset_vcf(
        minos_path.as_posix(), sorted(minos_positions)
    )

    # Pull out header parts to catch parts which need adding
    minos_format = [header for header in minos_headers if "##FORMAT" in header]
    minos_info = [header for header in minos_headers if "##INFO" in header]
    minos_filter = [header for header in minos_headers if "##FILTER" in header]

    gvcf_format = [header for header in gvcf_headers if "##FORMAT" in header]
    gvcf_info = [header for header in gvcf_headers if "##INFO" in header]
    gvcf_filter = [header for header in gvcf_headers if "##FILTER" in header]

    # GVCF-only header lines which must be merged in for the output to be valid
    missing_format = [header for header in gvcf_format if header not in minos_format]
    missing_info = [header for header in gvcf_info if header not in minos_info]
    missing_filter = [header for header in gvcf_filter if header not in minos_filter]

    # Everything else from the minos header (e.g. ##fileformat, ##contig),
    # excluding the #CHROM column-header line which must come last
    minos_misc_headers = [
        header
        for header in minos_headers
        if "##FORMAT" not in header
        and "##INFO" not in header
        and "##FILTER" not in header
        and "#CHROM" not in header
    ]

    chrom_line = [header for header in minos_headers if "#CHROM" in header][0]

    with open(output_path, "w") as f:
        for misc in minos_misc_headers:
            f.write(misc + "\n")

        # Merged format
        for header in minos_format + missing_format:
            f.write(header + "\n")

        # Merged info
        for header in minos_info + missing_info:
            f.write(header + "\n")

        # Merged filter
        for header in minos_filter + missing_filter:
            f.write(header + "\n")

        f.write(chrom_line + "\n")

        # Minos rows
        for row in minos_values:
            f.write(row + "\n")

        # GVCF rows
        for row in subset:
            fields = row.split("\t")
            # Replace the DP FORMAT key with COV as GVCF doesn't have a
            # COV-like field. Replace token-wise within the FORMAT column
            # (index 8) rather than str.replace on the whole row, which
            # would also corrupt other keys containing "DP" (e.g. DPF)
            # and any matching substring in the sample columns
            if len(fields) > 8:
                fields[8] = ":".join(
                    "COV" if key == "DP" else key for key in fields[8].split(":")
                )
            # GVCF doesn't explicitly call filter passes, so ensure the calls are picked up
            fields[6] = "PASS" if fields[6] == "." else fields[6]
            f.write("\t".join(fields) + "\n")
8 changes: 4 additions & 4 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = gnomonicus
version = 2.5.7
version = 2.6.0
author = Philip W Fowler, Jeremy Westhead
author_email = philip.fowler@ndm.ox.ac.uk
description = Python code to integrate results of tb-pipeline and provide an antibiogram, mutations and variants
Expand All @@ -16,14 +16,14 @@ package_dir =
packages = find:
python_requires = >=3.10
install_requires =
gumpy>=1.2.7
piezo>=0.8.2
gumpy>=1.3.0
piezo>=0.8.3
numpy
pandas
recursive_diff
zip_safe = False
include_package_data = True
scripts = bin/gnomonicus, bin/gbkToPkl, bin/gnomonicus-table-update
scripts = bin/gnomonicus, bin/gbkToPkl, bin/gnomonicus-table-update, bin/merge-vcfs

[options.package_data]
mypkg = config/*.gbk
Expand Down
Binary file modified tests/test-cases/TEST-DNA-no-overlap.gbk.pkl
Binary file not shown.
Binary file modified tests/test-cases/TEST-DNA.gbk.pkl
Binary file not shown.
154 changes: 154 additions & 0 deletions tests/unit/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2913,6 +2913,160 @@ def test_12():
recursive_eq(ordered(expectedJSON), ordered(actualJSON))


def test_13():
    """Force null calls by passing a min_dp
    Input:
        NC_045512.2-S_E484K-minos.vcf
    Expect output:
        variants: 23012g>x
        mutations: E484X
        predictions: {'AAA': 'F'}
    """
    # Setup
    setupOutput("13")
    reference = gnomonicus.loadGenome("tests/test-cases/NC_045512.2.gbk", False)
    catalogue = piezo.ResistanceCatalogue(
        "tests/test-cases/NC_045512.2-test-catalogue.csv", prediction_subset_only=True
    )

    vcf = gumpy.VCFFile(
        "tests/test-cases/NC_045512.2-S_E484K-minos.vcf",
        ignore_filter=True,
        bypass_reference_calls=True,
        # Ridiculously high min_dp to force null calls in pre-existing VCF
        # (the E484K row has DP=44, so min_dp=45 turns it into a null call)
        min_dp=45,
    )
    vcfStem = "NC_045512.2-S_E484K-minos"

    # Apply the VCF to the reference to build the sample genome
    sample = reference + vcf

    # Genome-level differences between reference and sample drive the variants
    diff = reference - sample

    # Populate the tables
    path = "tests/outputs/13/"
    gnomonicus.populateVariants(vcfStem, path, diff, True, False)
    mutations, referenceGenes, errors = gnomonicus.populateMutations(
        vcfStem, path, diff, reference, sample, catalogue, True, False
    )
    e, phenotypes, _ = gnomonicus.populateEffects(
        path, catalogue, mutations, referenceGenes, vcfStem, True, True
    )

    # Check for expected values within csvs
    variants = pd.read_csv(path + f"{vcfStem}.variants.csv")
    mutations = pd.read_csv(path + f"{vcfStem}.mutations.csv")
    effects = pd.read_csv(path + f"{vcfStem}.effects.csv")
    predictions = pd.read_csv(path + f"{vcfStem}.predictions.csv")

    # The low-depth row should surface as a null call (x), not the E484K SNP
    assert variants["variant"][0] == "23012g>x"

    assert mutations["gene"][0] == "S"
    assert mutations["mutation"][0] == "E484X"

    # Null call at a resistance site should give an 'F' (fail) prediction for AAA
    assert "AAA" in effects["drug"].to_list()
    assert effects["prediction"][effects["drug"].to_list().index("AAA")] == "F"

    # Every prediction row should carry the test catalogue's metadata;
    # exactly one AAA row ('F') and one BBB row ('S') are expected
    hits = []
    for _, row in predictions.iterrows():
        assert row["catalogue_name"] == "gnomonicus_test"
        assert row["catalogue_version"] == "v1.0"
        assert row["catalogue_values"] == "RFUS"
        if row["drug"] == "AAA":
            hits.append("AAA")
            assert row["prediction"] == "F"
        elif row["drug"] == "BBB":
            hits.append("BBB")
            assert row["prediction"] == "S"
        else:
            hits.append(None)
    assert sorted(hits) == ["AAA", "BBB"]

    gnomonicus.saveJSON(
        variants,
        mutations,
        e,
        phenotypes,
        path,
        vcfStem,
        catalogue,
        gnomonicus.__version__,
        -1,
        reference,
        "",
        "",
        "",
        {},
    )

    # Full expected JSON output, including the VCF evidence for the null call
    # (note DP is 44 — below the min_dp of 45 passed above)
    expectedJSON = {
        "meta": {
            "workflow_version": gnomonicus.__version__,
            "guid": vcfStem,
            "status": "success",
            "workflow_name": "gnomonicus",
            "workflow_task": "resistance_prediction",
            "reference": "NC_045512",
            "catalogue_type": "RFUS",
            "catalogue_name": "gnomonicus_test",
            "catalogue_version": "v1.0",
        },
        "data": {
            "variants": [
                {
                    "variant": "23012g>x",
                    "nucleotide_index": 23012,
                    "gene_name": "S",
                    "gene_position": 484,
                    "codon_idx": 0,
                    "vcf_evidence": {
                        "GT": [1, 1],
                        "DP": 44,
                        "DPF": 0.991,
                        "COV": [0, 44],
                        "FRS": 1.0,
                        "GT_CONF": 300.34,
                        "GT_CONF_PERCENTILE": 54.73,
                        "REF": "g",
                        "ALTS": ["a"],
                        "POS": 23012,
                    },
                    "vcf_idx": 1,
                }
            ],
            "mutations": [
                {
                    "mutation": "E484X",
                    "gene": "S",
                    "gene_position": 484,
                    "ref": "gaa",
                    "alt": "xaa",
                }
            ],
            "effects": {
                "AAA": [
                    {
                        "gene": "S",
                        "mutation": "E484X",
                        "prediction": "F",
                        "evidence": {"row": 2},
                    },
                    {"phenotype": "F"},
                ],
            },
            "antibiogram": {"AAA": "F", "BBB": "S"},
        },
    }
    # Round-trip through json to normalise key ordering before comparison
    expectedJSON = json.loads(json.dumps(expectedJSON, sort_keys=True))

    actualJSON = prep_json(
        json.load(open(os.path.join(path, f"{vcfStem}.gnomonicus-out.json"), "r"))
    )

    # assert == does work here, but gives ugly errors if mismatch
    # Recursive_eq reports neat places they differ
    recursive_eq(ordered(expectedJSON), ordered(actualJSON))


def compare_effects(effects: pd.DataFrame, expected: [str]) -> None:
"""Compare an effects DataFrame with the expected values
Expand Down

0 comments on commit d06efd7

Please sign in to comment.