-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhet_variant2.py
497 lines (472 loc) · 23.3 KB
/
het_variant2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
import pandas as pd
import numpy as np
import sys
import argparse
import subprocess
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plot
import matplotlib.patches as patches
from matplotlib import rcParams
from cycler import cycler
import matplotlib.pylab as lab
import glob
from itertools import cycle
rcParams['font.sans-serif'] = 'Arial'
rcParams['pdf.fonttype'] = 42
rcParams['ps.fonttype'] = 42
######################################## DESCRIPTIONS AND PARSER #####################################################
SCRIPT_DESC="""Code estimates heterozygosity across the genome for each individual in a vcf file and plots it as pdfs"""
PARSER=argparse.ArgumentParser(
description=SCRIPT_DESC,
formatter_class=argparse.RawDescriptionHelpFormatter)
## IMPORTANT: if only these two are included, only coverage code will be run
PARSER.add_argument("-v", "--vcf", required=True, help="what is the vcf file to analyze?")
PARSER.add_argument("-o", "--outdir", required=True, help="in which directory should we place the files (full path)?")
# these are for the matrix code. the first part asks if the matrix needs to be created
# in the matrix, 0=homo, 1=het, 2=missing call. for the rest, see below:
PARSER.add_argument("-p", "--plot", required=False, action="store_true", help="IMPORTANT:if you put anything here, only the plotting will take place")
PARSER.add_argument("-m", "--matrix", required=False, action="store_true", help="IMPORTANT:if you put anything here, the matrix code will run")
PARSER.add_argument("-k", "--skip", required=False, action="store_true", help="IMPORTANT: if you put anything here, the program will assume that the coverage file has already been made")
PARSER.add_argument("-r", "--filter_vcf", required=False, action="store_true", help="IMPORTANT: if you put anything here, the program will output a filtered vcf")
PARSER.add_argument("-c", "--covdiff", required=False, default=0.25, help="coverage deviation from average that's allowed the default is 0.25 meaning everything above 0.75*mean_coverage and everything below 1.25*mean_coverage will get a pass. fail=3 in the matrix")
PARSER.add_argument("-qds", "--qds", required=False, default=2,
help="Mininum variant score")
PARSER.add_argument("-sor", "--sor", required=False, default=3,
help="Maximum strand bias score")
PARSER.add_argument("-mq", "--mps", required=False, default=50,
help="Minimum mean mapping quality score")
PARSER.add_argument("-mqrs", "--mqrs", required=False, default=8,
help="Minimum mean quality rank sum score")
PARSER.add_argument("-rprs", "--rprs", required=False, default=8,
help="Minimum read pos rank sum")
# IMPORTANT: if you provide a scaffold info file, the program will assume you want heterozygosity estimates
# if you only provide sin, but do not fill matrix or skip, the program will perform the estimates only.
# if you also provide plot, however, only the plotting will be done
PARSER.add_argument("-s", "--sin", required=False, help="what is the scaffold info file?")
PARSER.add_argument("-w", "--window", required=False, default=100000, help="what is the sliding window size?")
PARSER.add_argument("-l", "--slide", required=False, default=10000, help="by how much will the window slide?")
PARSER.add_argument("-a", "--minallele", required=False, default=20, help="what is minimum number of PASS (0 or 1) alleles in window for the results to be outputted?")
PARSER.add_argument("-f", "--minfreqpass", required=False, default=0.05, help="what is the fraction of PASS (0 or 1) alleles in window for the window to be outputted?")
# This is for plotting purposes
PARSER.add_argument("-i", "--min", required=False, default=-0.02, help ="ylim on plots")
PARSER.add_argument("-x", "--max", required=False, default=1.02, help ="ymax on plots")
PARSER.add_argument("-xx", "--xx", required=False, default=2, help ="x-axis column")
PARSER.add_argument("-yy", "--yy", required=False, default=3, help ="y-axis column")
PARSER.add_argument("-n", "--minor_output", required=False, action="store_true", help ="plot and output minor allele frequency")
##########################################################################################################
ARGIES = PARSER.parse_args()
ARGDICT = {}
ARGDICT["vcf"] = ARGIES.vcf
OUTDIR = ARGIES.outdir
ARGDICT["outdir1"] = OUTDIR+"/info_files"
ARGDICT["outdir2"] = OUTDIR+"/het_files"
ARGDICT["outdir3"] = OUTDIR+"/het_plots"
ARGDICT["plotty"] = ARGIES.plot
ARGDICT["matrix"] = ARGIES.matrix
ARGDICT["skip"] = ARGIES.skip
ARGDICT["filter_vcf"] = ARGIES.filter_vcf
ARGDICT["covdiff"] = float(ARGIES.covdiff)
ARGDICT["qds"] = float(ARGIES.qds)
ARGDICT["mps"] = float(ARGIES.mps)
ARGDICT["sor"] = float(ARGIES.sor)
ARGDICT["mqrs"] = float(ARGIES.mqrs)
ARGDICT["rprs"] = float(ARGIES.rprs)
ARGDICT["sin"] = ARGIES.sin
ARGDICT["window"] = int(ARGIES.window)
ARGDICT["slide"] = int(ARGIES.slide)
ARGDICT["minallele"] = int(ARGIES.minallele)
ARGDICT["minfreqpass"] = float(ARGIES.minfreqpass)
ARGDICT["xx"] = int(ARGIES.xx)
ARGDICT["yy"]= int(ARGIES.yy)
ARGDICT["ylim_low"] = float(ARGIES.min)
ARGDICT["ylim_high"] = float(ARGIES.max)
ARGDICT["minor_output"] = ARGIES.minor_output
########################################### FUNCTIONS ##########################################################
# COVERAGE
def coverage():
cov = {}
subprocess.call("mkdir %s"%(ARGDICT["outdir1"]), shell=True)
outcov = open(ARGDICT["outdir1"]+"/coverageinfo.txt", "w")
outcov.write("%s\t%s\t%s\n"%("#STRAIN", "AVERAGE_DEPTH", "SD_DEPTH"))
with open(ARGDICT["vcf"], "r") as openvcf:
for line in openvcf:
if line[0:2] == "##": #this contains the lengths of the scaffold for the original pass I don't care about it.
pass
elif line[0:6] == "#CHROM": #need to store all the variable names for the samples and which columns correspond to it.
header = line.strip().split("\t")
header_strains = header[9:]
for strain in header_strains:
cov[strain] = {}
cov[strain]["count"] = 0.0
cov[strain]["covva"] = 0.0
cov[strain]["all_cov"] = []
else:
if line[0] != "#": #need average coverage for parent1, parent2, sel, and unsel
SNP_line = line.strip().split("\t")
ref_allele = SNP_line[3]
alt_alleles = SNP_line[4].split(",")
alt_snp = 1
for alt in alt_alleles:
if len(alt) > 1:
alt_snp = len(alt)
if len(ref_allele) == 1 and alt_snp == 1:
haplotypes = SNP_line[9:]
index = 0
for strain in header_strains:
choice = haplotypes[index]
if "./" not in choice:
reads = choice.split(":")[1].split(",")
parent_cov = sum([int(i) for i in reads])
cov[strain]["count"] += 1
cov[strain]["covva"] += parent_cov
cov[strain]["all_cov"].append(parent_cov)
index += 1
else:
index += 1
for strain in header_strains:
avecovvcf = str(cov[strain]["covva"]/cov[strain]["count"])
stdcovvcf = np.std(cov[strain]["all_cov"])
outcov.write("%s\t%s\t%s\n"%(strain, avecovvcf, stdcovvcf))
outcov.close()
def line_parser(line):
"""Parses VCF string to get mapping quality info"""
linedict = {}
line = line.split(";")
for keyval in line:
keyval = keyval.split("=")
if len(keyval) > 1 and "|" not in keyval[1]:
values = keyval[1].split(",")
if len(values) == 1:
linedict[keyval[0]] = float(keyval[1])
else:
linedict[keyval[0]] = min([float(i) for i in values])
return(linedict)
# MATRIX
def get_matrix():
cov = {}
with open(COVERAGEFILE, "r") as opencov:
for line in opencov:
if line[0] != "#":
linestrip = (line.rstrip()).split("\t")
st = linestrip[0]
co = float(linestrip[1])
cov[st] = co
outfile = open(ARGDICT["outdir1"]+"/matrix.txt", "w")
if ARGDICT["filter_vcf"]:
outvcf = open(ARGDICT["outdir1"]+"/filtered.vcf", "w")
with open(ARGDICT["vcf"], "r") as openvcf:
for line in openvcf:
if line[0:2] == "##": #this contains the lengths of the scaffold for the original pass I don't care about it.
if ARGDICT["filter_vcf"]:
outvcf.write(line)
elif line[0:6] == "#CHROM": #need to store all the variable names for the samples and which columns correspond to it.
header = line.strip().split("\t")
header_strains = header[9:]
outfile.write("\t".join(["chrom", "pos"]+header_strains)+"\n")
if ARGDICT["filter_vcf"]:
outvcf.write(line)
else:
if line[0] != "#": #need average coverage for parent1, parent2, sel, and unsel
SNP_line = line.strip().split("\t")
SNP_scaff = SNP_line[0]
SNP_pos = SNP_line[1]
ref_allele = SNP_line[3]
alt_alleles = SNP_line[4].split(",")
alt_snp = 1
for alt in alt_alleles:
if len(alt) > 1:
alt_snp = len(alt)
if len(ref_allele) == 1 and alt_snp == 1:
haplotypes = SNP_line[9:]
scores = [SNP_scaff, SNP_pos]
index = 0
for choice in haplotypes:
# is the variant there. 2 = not there
if "./" in choice:
scores.append("2")
index = index + 1
continue
# is the COVERAGEFILE okay?
reads = choice.split(":")[1].split(",")
covey = sum([int(i) for i in reads])
strain = header_strains[index]
avecov = cov[strain]
if covey < avecov*(1-ARGDICT["covdiff"]) or covey > avecov*(1+ARGDICT["covdiff"]):
scores.append("3")
index = index +1
continue
# this is the read quality filter
stuff = SNP_line[7]
quals = line_parser(stuff)
if ("QD" in quals and "MQ" in quals and "SOR" in quals
and "MQRankSum" in quals and "ReadPosRankSum" in quals):
if (quals["QD"] < ARGDICT["qds"]
or quals["MQ"] < ARGDICT["mps"]
or quals["SOR"] >= ARGDICT["sor"]
or quals["MQRankSum"] < -ARGDICT["mqrs"]
or quals["MQRankSum"] > ARGDICT["mqrs"]
or quals["ReadPosRankSum"] < -ARGDICT["rprs"]
or quals["ReadPosRankSum"] > ARGDICT["rprs"]):
scores.append("4")
index = index +1
continue
elif ("QD" in quals and "MQ" in quals and "SOR" in quals):
if (quals["QD"] < ARGDICT["qds"]
or quals["MQ"] < ARGDICT["mps"]
or quals["SOR"] >= ARGDICT["sor"]):
scores.append("4")
index = index +1
continue
else:
scores.append("5")
index = index +1
continue
# now the genotype pass
genotype = choice.split(":")[0]
numeric_geno = set([int(i) for i in genotype.split("/")])
alleles = genotype.split("/")
all_uniques = len(list(set(alleles)))
if all_uniques == 1:
reforalt = int(alleles[0])
if reforalt == 0:
scores.append("10")
index = index +1
else:
scores.append("-1")
index = index +1
else:
select_reads = [int(reads[i]) for i in numeric_geno]
minor = min(select_reads)/float(sum(select_reads))
scores.append("%s"%(minor))
index = index + 1
outfile.write("\t".join(scores)+"\n")
if ARGDICT["filter_vcf"]:
passing_scores = [i for i in scores[2:] if
0 <= float(i) <= 1 or int(i) == -1 or int(i) == 10]
if len(passing_scores) == len(scores[2:]):
outvcf.write(line)
if ARGDICT["filter_vcf"]:
outvcf.close()
outfile.close()
# this is to get the correct overall genomic position
def scale_dict(info_file):
info_read = open(info_file, "r")
info_dict = {}
final_lenny = 0
lenny = 0
for line in info_read:
linetab = (line.rstrip()).split("\t")
scaffy = linetab[0]
final_lenny = final_lenny + lenny
lenny = int(linetab[1])
info_dict[scaffy] = final_lenny
info_read.close()
return(info_dict)
## SLIDING WINDOW FOR HETEROZYGOSITY ACROSS THE GENOMES
def homo():
subprocess.call("mkdir %s"%(ARGDICT["outdir2"]), shell=True)
snps = pd.read_table(MATRIX_FILE, sep="\t")
### estimates for entire genome first ###
outhetstat = open(ARGDICT["outdir1"] + "/" + "allhet.txt", "w")
outhetstat.write("strain"+"\t"+"Genome_wide_heterozygosity"+"\t"+"Genome_wide_minor_freq"+"\t"+"Fraction_windows_with_het>0.1"+"\n")
outhetdict = {}
for col in snps.columns[2:]:
#print("col", col)
section = snps[col]
#print(section, "section")
homos = len(section[section==-1])
heteros = len(section[(section >= 0) & (section <= 1)])
if heteros > 0:
minorfreq = np.mean(section[(section >= 0) & (section <= 1)])
else:
minorfreq = 0
npass = homos+heteros
#print("Npass", npass)
passer = npass/float(len(section))
heterofix = heteros/float(npass)
strainy = str(col)
print(strainy, minorfreq)
outhetdict[strainy] = {}
outhetdict[strainy]["genome_wide_het"] = heterofix
outhetdict[strainy]["genome_wide_minor"] = minorfreq
outhetdict[strainy]["n_windows"] = 0
outhetdict[strainy]["high_het"] = 0
outfile = open(ARGDICT["outdir2"] + "/" + col + ".txt", "w") # to erase previous files if needed
outfile.close()
### end of estimate for entire genome ###
scaffolds = snps['chrom'].unique()
for scaff in scaffolds:
if (scaff in SHADER) == False:
continue
scaff_seg = snps[snps['chrom']==scaff]
beg = 0
win = ARGDICT["window"]
end = ARGDICT["window"]
slide = ARGDICT["slide"]
if max(scaff_seg['pos']) < win:
continue
while end < max(scaff_seg['pos']):
chunk = scaff_seg[(scaff_seg['pos'] > beg) & (scaff_seg['pos'] < end)]
ntotal = len(chunk)
med = beg + win/2
medpos = med + SHADER[scaff]
for col in chunk.columns[2:]:
#print("col", col)
section = chunk[col]
#print(section, "section")
homos = len(section[section==-1])
heteros = len(section[(section >= 0) & (section <= 1)])
if heteros > 0:
minorfreq = np.mean(section[(section >= 0) & (section <= 1)])
else:
minorfreq = 0
npass = homos+heteros
if len(section) > 0:
passer = npass/float(len(section))
else:
passer = 0
if npass >= ARGDICT["minallele"] and passer >= ARGDICT["minfreqpass"]:
heterofix = heteros/float(npass)
strainy = str(col)
outhetdict[strainy]["n_windows"] = outhetdict[strainy]["n_windows"]+1
if heterofix > 0.1:
outhetdict[strainy]["high_het"] = outhetdict[strainy]["high_het"] + 1
outfile = open(ARGDICT["outdir2"] + "/" + col + ".txt", "a")
outfile.write("\t".join([scaff, str(med), str(medpos), str(heterofix), str(minorfreq), str(npass), str(ntotal)])+"\n")
outfile.close()
beg = beg + slide
end = end + slide
for strain_dict in sorted(outhetdict):
high_het_count = str(outhetdict[strain_dict]["high_het"]/float(outhetdict[strain_dict]["n_windows"]))
outhetstat.write(strain_dict + "\t" + str(outhetdict[strain_dict]["genome_wide_het"]) + "\t" + str(outhetdict[strain_dict]["genome_wide_minor"]) + "\t" + high_het_count + "\n")
outhetstat.close()
## PLOTTING
# alternating colors
def plotting():
subprocess.call("mkdir %s"%(ARGDICT["outdir3"]), shell=True)
alterna = ['green', 'blue', 'violet', 'purple', 'royalblue', 'orange', 'fuchsia', 'pink']
plot.rc('axes', prop_cycle=(cycler('color', alterna)))
fileys = glob.glob(ARGDICT["outdir2"]+"/*.txt")
# now the plotting
print("***********************")
pool = cycle(alterna)
for filey in fileys:
fig = plot.figure(figsize=(15, 5), dpi=10000)
typies = filey.split("/")[-1].split(".")[0]
tsvet = next(pool)
print("%s : %s" %(tsvet, filey))
b = pd.read_table(filey, sep="\t", header=None)
if ARGDICT["minor_output"]:
plot.plot(b.iloc[0:,ARGDICT["xx"]],b.iloc[0:,ARGDICT["yy"]+1], lw=1, color="0.75")
plot.plot(b.iloc[0:,ARGDICT["xx"]],b.iloc[0:,ARGDICT["yy"]], lw=2, color=tsvet)
# ACCESSORY STUFF TO MAKE IT PRETTY
# how far will the plot go
lab.ylim(ARGDICT["ylim_low"], ARGDICT["ylim_high"])
maxx = max(b.iloc[0:,ARGDICT["xx"]])
lab.xlim(0, maxx)
# this are the variables for the shading below
old = None
shade = False
# here comes the shading
for contig in sorted(SHADER):
val = SHADER[contig]
if old != None and shade == True:
plot.axvspan(old, val, color='0.87', alpha=0.5)
shade = False
else:
if old != None:
shade = True
old = SHADER[contig]
# the last one
if SHADER == True:
plot.axvspan(old, maxx, color='0.87', alpha=0.5)
# grid and labels
plot.locator_params(axis='x',nbins=15)
plot.grid(True, axis="y")
# for better labeling:
z = [int(i) for i in plot.xticks()[0]][1:-1] # this is to create labels with numbers
labs = []
for i in plot.xticks()[0]:
j = int(i)
if len(str(j)) <= 2:
labs.append(str(round(int(j)/float(1), 3)))
elif 3 <= len(str(j)) <= 6:
labs.append(str(round(int(j)/float(1000), 3))+" Kb")
elif 3 <= len(str(j)) <= 9:
labs.append(str(int(int(j)/float(1000000))))
else:
labs.append(str(round(int(j)/float(1000000000), 1))+" Gb")
labs = labs[1:-1]
plot.xticks(z, labs)
plot.xlabel("Genomic Position (Mb)", fontsize = 22)
plot.ylabel("Fraction of heterozygous SNPs", fontsize = 22)
plot.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom='on', # ticks along the bottom edge are off
top='on', # ticks along the top edge are off
labelbottom='on',
labelsize = 22) # labels along the bottom edge are off
plot.tick_params(
axis='y', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
right='off', # ticks along the bottom edge are off
left='on', # ticks along the top edge are on
labelbottom='off', # labels along the bottom edge are off
labelsize = 22) # changes label size to whatevs
fig.savefig(ARGDICT["outdir3"]+"/"+typies+".pdf", bbox_inches='tight')
fig.clf()
plot.close('all')
print("***********************")
############################################ THIS PART OF THE SCRIPT ACTUALLY RUNS THE FUNCTIONS ###################
if ARGDICT["plotty"]:
print("plotting every file")
SHADER = scale_dict(ARGDICT["sin"])
plotting()
else:
if not ARGDICT["sin"]:
if not ARGDICT["matrix"]:
# creates the coverage info file
print("creating coverage info file")
coverage()
else:
if ARGDICT["skip"]:
COVERAGEFILE = ARGDICT["outdir1"]+"/coverageinfo.txt"
print("creating matrix")
get_matrix()
else:
# creates the coverage info file
print("creating coverage info file")
coverage()
COVERAGEFILE = ARGDICT["outdir1"]+"/coverageinfo.txt"
# creates the matrix
print("creating matrix")
get_matrix()
else:
if not ARGDICT["matrix"]:
print("performing heterozygosity estimates for each individual")
MATRIX_FILE = ARGDICT["outdir1"]+"/"+"matrix.txt"
# process the scaffold information
SHADER = scale_dict(ARGDICT["sin"])
homo()
# plots heterozygosity
print("plotting every file")
plotting()
else:
if not ARGDICT["skip"]:
# creates the coverage info file
print("creating coverage info file")
coverage()
COVERAGEFILE = ARGDICT["outdir1"]+"/coverageinfo.txt"
print("creating matrix")
get_matrix()
MATRIX_FILE = ARGDICT["outdir1"]+"/"+"matrix.txt"
# processing the scaffold information
SHADER = scale_dict(ARGDICT["sin"])
print("performing heterozygosity estimates for each individual")
homo()
# plots heterozygosity
print("plotting every file")
plotting()
############################################ THE END ##############################################