-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_enrichment.diff.py
executable file
·143 lines (128 loc) · 5.46 KB
/
get_enrichment.diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
desc="""Return enrichment of various types of editing
"""
epilog="""Author:
l.p.pryszcz@gmail.com
Warsaw/Bratislava/Fribourg, 21/07/2015
"""
import os, sys,gzip
import numpy as np
from remove_dbSNP import load_dbSNP
bases = "ACGT"
strands = "+-"
bins = 20
base2rc= {"A": "T", "T": "A", "C": "G", "G": "C", ">": ">", "+": "-", "-": "+"}
def load_diff(infile, snps):
"""Load RNA editing"""
snp2c = {template%(a, b, s): 0 for a in bases for b in bases for s in strands if a!=b}
for i, l in enumerate(gzip.open(infile), 1):
fields = l[:-1].split('\t')
if i==2:
fnames = [fields[i].split()[0] for i in range(4, len(fields), 2)]
continue
if not chrom.startswith('chr'):
chrom = "chr%s"%chrom
def txt2changes(editing, handle, snps, minDepth=20, minFreq=0.01, minSamples=3, template="%s>%s%s"):
"""Return dictionary of snps and their occurencies"""
snp2c = {template%(a, b, s): 0 for a in bases for b in bases for s in strands if a!=b}
for l in handle:
ldata = l.replace('\t\t','\t')[:-1].split('\t')
if l.startswith('#') or not l.endswith('\n') or len(ldata)<3:
continue
# REDiscover output
if ldata[2] in '+-.': #len(ldata)>4
chrom, pos, strand, ref, alt = ldata[:5]
if not chrom.startswith('chr'):
chrom = "chr%s"%chrom
altcov, altfreq = ldata[-2:]
altcov, altfreq = int(altcov), float(altfreq)
# skip if known snp or low cov
if altcov < minDepth or chrom in snps and int(pos) in snps[chrom]:
continue
snp = template%(ref, alt, strand)
# REDiscover.diff2 output # get
elif len(ldata)>4:
chrom, pos, snp = ldata[:3]
sampledata = np.array(map(float, ldata[3:])).reshape(len(ldata[3:])/2, 2)
passed = sum((sampledata[:, 0]>=minDepth) & (sampledata[:, 1]>=minFreq))# & (sampledata[:, 1]<1.0))
#print passed, ldata[3:]
if passed<minSamples: continue
# common.txt
else:
chrom, pos, snp = ldata[:3]
if snp not in snp2c:
continue
snp2c[snp] += 1
# store editing
k = "%s:%s"%(pos, snp)
if chrom not in editing:
editing[chrom] = {k: 1}
elif k not in editing[chrom]:
editing[chrom][k] = 1
else:
editing[chrom][k] += 1
return editing, snp2c
def get_enrichment(fnames, snps, minDepth, minAltfreq, minsamples, snptypes, out=sys.stdout):
# process all files
editing = {}
for fn in fnames:
try:
editing, snp2c = txt2changes(editing, open(fn), snps, minDepth, minAltfreq, minsamples)
except Exception, e:
sys.stderr.write("[ERROR] Couldn't parse %s with error: %s\n"%(fn, str(e)))
continue
total = sum(snp2c.itervalues())
if not total:
sys.stderr.write("[WARNING] No editing in %s\n"%(fn, ))
continue
# get freqs and strand enrichment
strands = []
freqs = []
for snp in sorted(filter(lambda x: x[-1]=="+", snp2c)):
snprc = "".join(base2rc[b] for b in snp)
freq = 1.*(snp2c[snp]+snp2c[snprc])/total
freqs.append(freq)
strands.append((snp2c[snp], snp2c[snp[:-1]+"-"]))
strands = np.array(strands, dtype="float32")
frac = sum(strands.max(axis=1))/strands.sum()
print "%s %s\t%s\t%s\t%s"%(fn, minsamples, total, frac, "\t".join(map(str, freqs)))
# don't output common txt for single file
if len(fnames)>1:
with open("common.txt", "w") as out:
for chrom in editing:
for k, v in editing[chrom].iteritems():
out.write("%s\t%s\t%s\n"%(chrom, k.replace(':', '\t'), v))
def main():
import argparse
usage = "%(prog)s [options]"
parser = argparse.ArgumentParser(usage=usage, description=desc, epilog=epilog, \
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-v", "--verbose", default=False, action="store_true", help="verbose")
parser.add_argument('--version', action='version', version='1.15b')
parser.add_argument("-i", "--fnames", nargs="+", help="files to preocess")
parser.add_argument("-s", "--snps", default=[], nargs="+", help="dbSNP file(s)")
parser.add_argument("-d", "--minDepth", default=5, type=int,
help="minimal depth of coverage [%(default)s]")
parser.add_argument("-f", "--minAltfreq", default=0.01, type=float,
help="min frequency for RNA editing base [%(default)s]")
parser.add_argument("-n", "--minsamples", nargs="+", default=[1], type=int, help="number of samples [%(default)s]")
# print help if no parameters
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
o = parser.parse_args()
if o.verbose:
sys.stderr.write("Options: %s\n"%str(o))
snps = {}
for snpfn in o.snps:
_snps = load_dbSNP(snpfn)
for c in _snps:
if c not in snps:
snps[c] = set()
snps[c] += _snps[c]
snptypes = ["%s>%s"%(a, b) for a in bases for b in bases if a!=b]
print "#fname\tsites\tstrand enrichment\t"+"\t".join(snptypes)
for n in o.minsamples:
get_enrichment(o.fnames, snps, o.minDepth, o.minAltfreq, n, snptypes)
if __name__=="__main__":
main()