added coverage eval (#6)

jonas-fuchs · Aug 21, 2023 · 9698f2b · 9698f2b
1 parent 626e24c
commit 9698f2b
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@ usage: virheat <folder containing vcfs> <output dir> -l or -g [additional argum
 
 ```
 positional arguments:
- input folder containing vcf files and output folder
+ input folder containing vcf (and tsv) files and output folder
 
 optional arguments:
  -h, --help show this help message and exit
@@ -71,11 +71,14 @@ optional arguments:
  delete mutations with frequencies present in all
  samples (default: True)
  --sort, --no-sort sort alphanumerically (default: False)
+ --min-cov 20 display mutations covered at least x time (only if per base cov tsv files are provided)
  -v, --version show program's version number and exit
 ```
 
 You need to either provide the length of your reference genome or if you want to get the sequence annotation you will need to provide the gff3 file.
 
+Additionally, you can also analyse if mutations are sufficiently covered and display non-covered cells in grey. For that first create a per base coverage tsv files for each bam file with [Qualimap](http://qualimap.conesalab.org/) and provide it in the same folder as the vcf files. Give them the same name as your vcf files.
+
 ---
 
 **Important disclaimer:**

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 matplotlib>=3.5.1
 numpy>=1.23.3
 vcfpy>=0.13.6
+pandas>=1.4.4
diff --git a/virheat/__init__.py b/virheat/__init__.py
@@ -1,3 +1,3 @@
 """plot vcf data as a heatmap mapped to a virus genome"""
 _program = "virheat"
-__version__ = "0.4"
+__version__ = "0.5"
diff --git a/virheat/command.py b/virheat/command.py
@@ -26,11 +26,11 @@ def get_args(sysargs):
  """
  parser = argparse.ArgumentParser(
  prog=_program,
- usage='''\tvirheat <folder containing vcfs> <output dir> -l or -g [additional arguments]''')
+ usage='''\tvirheat <folder containing input files (vcf/tsv)> <output dir> -l or -g [additional arguments]''')
  parser.add_argument(
  "input",
  nargs=2,
- help="folder containing vcf files and output folder"
+ help="folder containing input files and output folder"
  )
  parser.add_argument(
  "-l",
@@ -76,6 +76,13 @@ def get_args(sysargs):
  default=False,
  help="sort alphanumerically"
  )
+ parser.add_argument(
+ "--min-cov",
+ type=int,
+ metavar="20",
+ default=20,
+ help="display mutations covered at least x time (only if per base cov tsv files are provided)"
+ )
  parser.add_argument(
  "-v",
  "--version",
@@ -98,7 +105,7 @@ def main(sysargs=sys.argv[1:]):
  args = get_args(sysargs)
 
  # get vcf files and sort
- vcf_files = data_prep.get_vcf_files(args.input[0])
+ vcf_files = data_prep.get_files(args.input[0], "vcf")
  if args.sort:
  vcf_files = sorted(vcf_files, key=lambda x: data_prep.get_digit_and_alpha(os.path.basename(x)))
 
@@ -107,6 +114,8 @@ def main(sysargs=sys.argv[1:]):
  frequency_array = data_prep.create_freq_array(unique_mutations, frequency_lists)
  if args.delete:
  frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations)
+ # annotate low coverage if per base coveage from qualimap was provided
+ data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations)
 
  # define relative locations of all items in the plot
  n_samples = len(frequency_array)
@@ -123,7 +132,7 @@ def main(sysargs=sys.argv[1:]):
  gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path)
  # issue a warning if #CHROM and gff3 do not match
  if gff3_ref_name not in reference_name and reference_name not in gff3_ref_name:
- print("\033[31m\033[WARNING:\033[0m gff3 reference does not match the vcf reference!")
+ print("\033[31m\033[1mWARNING:\033[0m gff3 reference does not match the vcf reference!")
  genome_end = data_prep.get_genome_end(gff3_info)
  annotation_list = args.gff3_annotations.split(",")
  genes_with_mutations, n_tracks = data_prep.create_track_dict(unique_mutations, gff3_info, annotation_list)
@@ -144,7 +153,9 @@ def main(sysargs=sys.argv[1:]):
  fig, ax = plt.subplots(figsize=[y_size, x_size])
 
  # plot all elements
- cmap_cells = cm.ScalarMappable(norm=colors.Normalize(0, 1), cmap=colormaps["gist_heat_r"])
+ cmap = cm.gist_heat_r
+ cmap.set_bad('silver', 1.)
+ cmap_cells = cm.ScalarMappable(norm=colors.Normalize(0, 1), cmap=cmap)
  plotting.create_heatmap(ax, frequency_array, cmap_cells)
  mutation_set = plotting.create_genome_vis(ax, genome_y_location, n_mutations, unique_mutations, genome_end)
  if args.gff3_path is not None:

diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py
@@ -10,13 +10,14 @@
 
 # LIBS
 import numpy as np
+import pandas as pd
 
 
-def get_vcf_files(path):
+def get_files(path, type):
  """
  returns a list of vcf files in a paticular folder
  """
- return list(pathlib.Path(path).glob('*.vcf'))
+ return list(pathlib.Path(path).glob(f'*.{type}'))
 
 
 def get_digit_and_alpha(filename):
@@ -60,7 +61,7 @@ def read_vcf(vcf_file):
  lines = [l.split("\t") for l in f if not l.startswith('#')]
  # check if vcf is empty
  if not lines:
- print(f"WARNING: {vcf_file} is empty!")
+ print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} is empty!")
  # get standard headers as keys
  for key in header[0:6]:
  vcf_dict[key] = []
@@ -147,6 +148,29 @@ def create_freq_array(unique_mutations, frequency_lists):
  return np.array(frequency_array)
 
 
+def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations):
+ """
+ Insert nan values into np array if position is not covered. Needs
+ per base coverage tsv files created by bamqc
+ """
+
+ # get tsv files
+ per_base_coverage_files = get_files(coverage_dir, "tsv")
+ if per_base_coverage_files:
+ for i, (file_name, array) in enumerate(zip(file_names, frequency_array)):
+ if file_name not in [os.path.splitext(os.path.basename(file))[0] for file in per_base_coverage_files]:
+ print(f"\033[31m\033[1mWARNING:\033[0m {file_name} was not found in tsv files.")
+ continue
+ tsv_file = [file for file in per_base_coverage_files if os.path.splitext(os.path.basename(file))[0] == file_name][0]
+ coverage = pd.read_csv(tsv_file, sep="\t")
+ for j, (mutation, frequency) in enumerate(zip(unique_mutations, array)):
+ mut_pos = int(mutation.split("_")[0])
+ if all([frequency == 0, coverage[coverage["pos"] == mut_pos]["coverage"].iloc[0] <= min_coverage]):
+ frequency_array[i][j] = np.NAN
+
+ return np.ma.array(frequency_array, mask=np.isnan(frequency_array))
+
+
 def delete_common_mutations(frequency_array, unique_mutations):
  """
  delete rows of common mutations (non-zero) that are in the array