-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_phylosift_sts.py
139 lines (123 loc) · 4.5 KB
/
parse_phylosift_sts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/common/usg/languages/python/2.7.4/bin/python
import sys
"""parse_phylosift_sts.py: takes a sequence_taxa_summary.txt file from the program phyosift (run on only one bin/genome)
and uses a cutoff probability and percentage to determine the taxonomy of that bin/genome"""
__author__ = "Sarah Stevens"
__email__ = "sstevens2@wisc.edu"
def usage():
print "Usage: parse_phylosift_sts.py inputfile cutoffprob. cutoffpercentage outnameprefix BacteriaArchaeaMarkersOnly(True/False?)"
print "Example: parse_phylosift_sts.py sequence_taxa_summary.txt .90 .70 sts_mygenome.txt True"
print "The cutoffprob. and cutoffperc. need to be in decimal form, not percentages"
print "The cutoffprob. removes any marker genes with probabilies below that value, for each level of taxonomy."
print "The cutoffperc. only allows for classification if the marker genes match at that level of taxonomy above this value."
print "Must use 'True' or 'False' exactly for using only the Bacterial/Archaeal Marker genes"
if len(sys.argv) !=6:
usage()
exit()
#Bring in the inputs and set up output
inputfile=open(sys.argv[1], "rU")
list1 =inputfile.readlines()
co_prob=float(sys.argv[2])
co_perc=float(sys.argv[3])
outname=sys.argv[4]
concatonly=bool(False)
if sys.argv[4] == "True":
concatonly=bool(True)
outputfile=open(outname+".prob+"+str(co_prob)+".perc"+str(co_perc)+".txt", "w")
def prob_check(list2check): #Removes all lines below the co_prob or that have 'no rank', returns list with all fines above or equal to the co_prob
outlist=[]
#checklist=[]
for line in list2check:
if not (float(line[5]) < co_prob):
if not (line[3] == 'no rank'):
#checklist.append(line[5])
outlist.append(line)
#print checklist
return outlist
def match_check(list2check, rank2check): #Function for checking specific level of taxonomy for matching, returns if matches above the co_perc. and if True, what the match is
ranklist = []
markerlist=[]
#adds all the lines that match the rank to ranklist
for line in list2check:
if line[3] == rank2check:
ranklist.append(line)
markerlist.append(line[-1])
rankcount=[]
#checks all possible combos for matching ranks with that line
for line in ranklist:
count=(len(line[1].split("."))/2)
#count=0
for line2 in ranklist:
if line != line2:
if line[4] == line2[4]:
#count+=1
count=count+(len(line2[1].split("."))/2)
rankcount.append(count)
num_matches= max(rankcount)
total=len(rankcount)
bm_index=rankcount.index(max(rankcount))
bm_name =ranklist[bm_index][4]
index=0
mmlist=[]
for num in rankcount:
if num != max(rankcount):
mmlist.append(ranklist[index][1])
index+=1
#print num_matches, total, (float(num_matches)/float(total)), co_perc
if total <= 1:
return False, mmlist
if (float(num_matches)/float(total)) < co_perc:
return False, mmlist
else:
outputfile.write(rank2check+": "+bm_name+"\n")
print(bm_name)
return True, mmlist
def trueMatch(matchTF): #terminates program if matchTF is false
if matchTF == False:
outputfile.close()
sys.exit()
def mremover(list2check, mlist): #removes markers ruled out from previous levels
outputlist=[]
for line in list2check:
if (line[1] not in mlist) and (line[1] not in outputlist):
if line not in outputlist:
outputlist.append(line)
return outputlist
def checkrank(list2check, rank): #checks that all the makers have that rank, returns list of markers to remove
missmarklist=[]
rankmatchlist=[]
for line in list2check:
if (line[3] == rank):
rankmatchlist.append(line[1])
for line in list2check:
if (line[1] not in rankmatchlist) and (line[1] not in missmarklist):
missmarklist.append(line[1])
return missmarklist
def bacterialmonly(list2check):
outputlist=[]
for line in list2check:
if line[-1].split("\n")[0] == "concat" or line[-1].split("\n")[0] == "16s_reps_bac":
outputlist.append(line)
return outputlist
#Sort lines of inputfile into lists to be used by prob_check
list2=[]
for line in list1:
list2.append(line.split("\t"))
inputfile.close()
#print(list2[0])
list2.pop(0)
#run prob_check on the list
prob_list=prob_check(list2)
#print prob_list
#run match-check on each level
taxon_ranks=['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
for rank in taxon_ranks:
if len(prob_list) != 0:
if concatonly == True:
prob_list=bacterialmonly(prob_list)
norankmlist=checkrank(prob_list, rank)
prob_list=mremover(prob_list,norankmlist)
matchTF, mismlist =match_check(prob_list, rank)
if len(mismlist) != 0:
prob_list=mremover(prob_list, mismlist)
trueMatch(matchTF)