-
Notifications
You must be signed in to change notification settings - Fork 10
/
calculate_phylocsf.py
101 lines (85 loc) · 3.46 KB
/
calculate_phylocsf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
'''
the script is modified from Mikael Hussius @ SciLifeLab, https://github.com/hussius/gff-phylocsf-human
download the following bigwig files first
# wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+0.bw
# wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+1.bw
# wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+2.bw
# wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-0.bw
# wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-1.bw
# wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-2.bw
'''
import sys
import os
import pyBigWig as pw
import numpy as np
def predict_coding(vec):
coding = "no"
for v in vec:
if not v: continue
if v > 0: coding = "yes"
return(coding)
if len(sys.argv)<4:
sys.exit("USAGE: python " + sys.argv[0] + "<GFF file> <BigWig file path> <output file>")
infile = sys.argv[1]
bw_file_path = sys.argv[2]
outfile = sys.argv[3]
regs = []
chrom={}
starts={}
ends={}
peptide={}
for line in open(infile):
if not line.startswith("chr"):
continue
fields = line.strip().split()
(chr, start, end, pept) = (fields[0], fields[3], fields[4], fields[8])
if not pept.startswith("Parent="): continue
name = chr+":"+start+"-"+end
chrom[name]=chr
starts[name]=int(start)
ends[name]=int(end)
peptide[name]=pept.split("=")[1]
regs.append(name)
scores = {}
rpathbase = os.path.join(bw_file_path,"PhyloCSF")
for rf in ["+0","+1","+2","+3","-0","-1","-2","-3"]:
rpath = rpathbase + rf + ".bw"
if os.path.isfile(rpath):
sys.stderr.write("Searching PhyloCSF reading frame " + rf + "\n")
bw = pw.open(rpath)
frame_score = {}
count = 0
for r in regs:
count += 1
if(count % 50 ==0): sys.stderr.write('\tProcessed ' + str(count) + " peptides out of " + str(len(regs)) + "\n")
sys.stderr.flush()
try:
score = bw.stats(chrom[r], starts[r], ends[r])[0]
except RuntimeError:
pass
frame_score[r] = score
scores[rf] = frame_score
bw.close()
else:
sys.stderr.write("%s doesn't exist \n" % rpath)
output = open(outfile,"w")
output.write("\t".join(["Peptide","PhyloCSF+0.score","PhyloCSF+1.score","PhyloCSF+2.score","PhyloCSF-0.score","PhyloCSF-1.score","PhyloCSF-2.score","PhyloCSF_prediction"])+"\n")
pep_scores={}
for r in regs:
scoreList = [scores["+0"][r], scores["+1"][r], scores["+2"][r], scores["-0"][r], scores["-1"][r], scores["-2"][r]]
seq = peptide[r]
if seq not in pep_scores:
pep_scores[seq]=scoreList
else: # this is to consider splice junction peptides which have two regions separated in gff file, we take mean phylocsf score of two regions
for i in range(0,len(scoreList)):
value = scoreList[i]
if value is None and pep_scores[seq][i] is None:
continue
elif None in [value, pep_scores[seq][i]]:
pep_scores[seq][i] = value if value else pep_scores[seq][i]
else:
pep_scores[seq][i] = (pep_scores[seq][i] + value)/2
for seq in pep_scores:
scoreList = pep_scores[seq]
row = [seq]+['NA' if x is None else str(x) for x in scoreList] + [predict_coding(scoreList)]
output.write('\t'.join(row) + '\n')