Skip to content

Commit

Permalink
added coverage eval (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonas-fuchs committed Aug 21, 2023
1 parent 626e24c commit 9698f2b
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 10 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ usage: virheat <folder containing vcfs> <output dir> -l or -g [additional argum

```
positional arguments:
input folder containing vcf files and output folder
input folder containing vcf (and tsv) files and output folder
optional arguments:
-h, --help show this help message and exit
Expand All @@ -71,11 +71,14 @@ optional arguments:
delete mutations with frequencies present in all
samples (default: True)
--sort, --no-sort sort alphanumerically (default: False)
--min-cov 20 display mutations covered at least x time (only if per base cov tsv files are provided)
-v, --version show program's version number and exit
```

You need to either provide the length of your reference genome or if you want to get the sequence annotation you will need to provide the gff3 file.

Additionally, you can also analyse if mutations are sufficiently covered and display non-covered cells in grey. For that first create a per base coverage tsv files for each bam file with [Qualimap](http://qualimap.conesalab.org/) and provide it in the same folder as the vcf files. Give them the same name as your vcf files.

---

**Important disclaimer:**
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
matplotlib>=3.5.1
numpy>=1.23.3
vcfpy>=0.13.6
pandas>=1.4.4
2 changes: 1 addition & 1 deletion virheat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""plot vcf data as a heatmap mapped to a virus genome"""
_program = "virheat"
__version__ = "0.4"
__version__ = "0.5"
21 changes: 16 additions & 5 deletions virheat/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ def get_args(sysargs):
"""
parser = argparse.ArgumentParser(
prog=_program,
usage='''\tvirheat <folder containing vcfs> <output dir> -l or -g [additional arguments]''')
usage='''\tvirheat <folder containing input files (vcf/tsv)> <output dir> -l or -g [additional arguments]''')
parser.add_argument(
"input",
nargs=2,
help="folder containing vcf files and output folder"
help="folder containing input files and output folder"
)
parser.add_argument(
"-l",
Expand Down Expand Up @@ -76,6 +76,13 @@ def get_args(sysargs):
default=False,
help="sort alphanumerically"
)
parser.add_argument(
"--min-cov",
type=int,
metavar="20",
default=20,
help="display mutations covered at least x time (only if per base cov tsv files are provided)"
)
parser.add_argument(
"-v",
"--version",
Expand All @@ -98,7 +105,7 @@ def main(sysargs=sys.argv[1:]):
args = get_args(sysargs)

# get vcf files and sort
vcf_files = data_prep.get_vcf_files(args.input[0])
vcf_files = data_prep.get_files(args.input[0], "vcf")
if args.sort:
vcf_files = sorted(vcf_files, key=lambda x: data_prep.get_digit_and_alpha(os.path.basename(x)))

Expand All @@ -107,6 +114,8 @@ def main(sysargs=sys.argv[1:]):
frequency_array = data_prep.create_freq_array(unique_mutations, frequency_lists)
if args.delete:
frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations)
# annotate low coverage if per base coveage from qualimap was provided
data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations)

# define relative locations of all items in the plot
n_samples = len(frequency_array)
Expand All @@ -123,7 +132,7 @@ def main(sysargs=sys.argv[1:]):
gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path)
# issue a warning if #CHROM and gff3 do not match
if gff3_ref_name not in reference_name and reference_name not in gff3_ref_name:
print("\033[31m\033[WARNING:\033[0m gff3 reference does not match the vcf reference!")
print("\033[31m\033[1mWARNING:\033[0m gff3 reference does not match the vcf reference!")
genome_end = data_prep.get_genome_end(gff3_info)
annotation_list = args.gff3_annotations.split(",")
genes_with_mutations, n_tracks = data_prep.create_track_dict(unique_mutations, gff3_info, annotation_list)
Expand All @@ -144,7 +153,9 @@ def main(sysargs=sys.argv[1:]):
fig, ax = plt.subplots(figsize=[y_size, x_size])

# plot all elements
cmap_cells = cm.ScalarMappable(norm=colors.Normalize(0, 1), cmap=colormaps["gist_heat_r"])
cmap = cm.gist_heat_r
cmap.set_bad('silver', 1.)
cmap_cells = cm.ScalarMappable(norm=colors.Normalize(0, 1), cmap=cmap)
plotting.create_heatmap(ax, frequency_array, cmap_cells)
mutation_set = plotting.create_genome_vis(ax, genome_y_location, n_mutations, unique_mutations, genome_end)
if args.gff3_path is not None:
Expand Down
30 changes: 27 additions & 3 deletions virheat/scripts/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@

# LIBS
import numpy as np
import pandas as pd


def get_vcf_files(path):
def get_files(path, type):
"""
returns a list of vcf files in a paticular folder
"""
return list(pathlib.Path(path).glob('*.vcf'))
return list(pathlib.Path(path).glob(f'*.{type}'))


def get_digit_and_alpha(filename):
Expand Down Expand Up @@ -60,7 +61,7 @@ def read_vcf(vcf_file):
lines = [l.split("\t") for l in f if not l.startswith('#')]
# check if vcf is empty
if not lines:
print(f"WARNING: {vcf_file} is empty!")
print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} is empty!")
# get standard headers as keys
for key in header[0:6]:
vcf_dict[key] = []
Expand Down Expand Up @@ -147,6 +148,29 @@ def create_freq_array(unique_mutations, frequency_lists):
return np.array(frequency_array)


def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations):
"""
Insert nan values into np array if position is not covered. Needs
per base coverage tsv files created by bamqc
"""

# get tsv files
per_base_coverage_files = get_files(coverage_dir, "tsv")
if per_base_coverage_files:
for i, (file_name, array) in enumerate(zip(file_names, frequency_array)):
if file_name not in [os.path.splitext(os.path.basename(file))[0] for file in per_base_coverage_files]:
print(f"\033[31m\033[1mWARNING:\033[0m {file_name} was not found in tsv files.")
continue
tsv_file = [file for file in per_base_coverage_files if os.path.splitext(os.path.basename(file))[0] == file_name][0]
coverage = pd.read_csv(tsv_file, sep="\t")
for j, (mutation, frequency) in enumerate(zip(unique_mutations, array)):
mut_pos = int(mutation.split("_")[0])
if all([frequency == 0, coverage[coverage["pos"] == mut_pos]["coverage"].iloc[0] <= min_coverage]):
frequency_array[i][j] = np.NAN

return np.ma.array(frequency_array, mask=np.isnan(frequency_array))


def delete_common_mutations(frequency_array, unique_mutations):
"""
delete rows of common mutations (non-zero) that are in the array
Expand Down

0 comments on commit 9698f2b

Please sign in to comment.