-
Notifications
You must be signed in to change notification settings - Fork 2
/
remove_dbSNP.py
executable file
·69 lines (61 loc) · 2.01 KB
/
remove_dbSNP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
# Strip snps present in dbSNP
# USAGE: remove_dbSNP.py 00-common_all.vcf.gz file1 [file2 ... fileN]
import os, sys, gzip, pickle
def load_dbSNP(vcf, add_chr=1):
"""Return dictionary of SNP chromosomes and positions"""
# load pickle
if os.path.isfile(vcf+".pickle"):
snps=pickle.load(open(vcf+".pickle"))
print "Loaded %s SNPs from %s"%(sum(map(len, snps.values())), vcf)
return snps
snps = {}
j = k = 0
for i, l in enumerate(gzip.open(vcf), 1):
#if i>1000: break
if l.startswith('#'):
continue
j += 1
chrom, pos, rs, ref, alt = l[:-1].split('\t')[:5]
#if len(ref)!=len(alt): continue
# 1 -> chr1
if add_chr and not chrom.startswith('chr'):
chrom = "chr%s"%chrom
if chrom not in snps:
snps[chrom] = set()
snps[chrom].add(int(pos))
k += 1
print "Loaded %s out of %s SNPs"%(k, j)
pickle.dump(snps, open(vcf+".pickle", "w"), 2)
return snps
def parse_editing(fn, snps, out):
"""Ignore positions with known SNPs"""
j = k = 1
for l in open(fn):
if l.startswith("#"):
out.write(l)
continue
if len(l[:-1].split('\t'))<5:
continue
j += 1
chrom, pos, strand, ref, alt = l[:-1].split('\t')[:5]
if chrom in snps and int(pos) in snps[chrom]:
continue
out.write(l)
k += 1
return j, k
def main():
vcf = sys.argv[1]
fnames = sys.argv[2:]
snps = load_dbSNP(vcf)
print "#fname\tnot in dbSNP\tall"
for fn in filter(lambda fn: not fn.endswith('.parsed.txt'), fnames):
outfn = fn+".parsed.txt"
# skip if outfn exists and newer than fn
if os.path.isfile(outfn) and os.stat(fn).st_mtime < os.stat(outfn).st_mtime:
continue
out = open(outfn, "w")
j, k = parse_editing(fn, snps, out)
print "\t".join(map(str, (fn, k, j)))
if __name__=="__main__":
main()