-
Notifications
You must be signed in to change notification settings - Fork 0
/
decision_tree.py
87 lines (73 loc) · 2.18 KB
/
decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
### Introduction: This script trains a DecisionTreeClassifier on features read from PSSM files and reports the accuracy obtained from 5-fold cross-validation. The results for every odd window size from 3 to 25 are printed.
# Order of the residues; the position in this string selects the one-hot slot.
feature_string = 'VYWTSPFMKLIHGEQCDNRA'

# 39-element template with a single 1 in the middle (index 19). Sliding a
# 20-wide view across it yields a different one-hot vector for each offset.
raw_list = [0] * 19 + [1] + [0] * 19

# Integer class code for each label symbol.
dict_of_labels = {'H': 0, 'S': 1, 'C': 2, 'NO': 3}

# Residue letter -> 20-element one-hot vector. The residue at offset k of
# feature_string gets its 1 at index 19 - k ('V' -> index 19, 'A' -> index 0).
dict_of_aa = {
    residue: raw_list[offset:offset + 20]
    for offset, residue in enumerate(feature_string)
}
def protein_id_from_filename(filename):
    """Return the protein identifier encoded in a PSSM file name.

    A trailing ``.pssm`` and then a trailing ``.fa`` are removed as whole
    suffixes. ``str.strip('.fa.pssm')`` must not be used for this: it removes
    any of the characters ``. f a p s m`` from BOTH ends of the name, which
    corrupts identifiers that start or end with one of those letters.
    """
    name = filename
    for suffix in ('.pssm', '.fa'):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    return name


def read_pssm_profile(path):
    """Read one PSSM file and return a list of per-residue float rows.

    The first three lines are skipped, as is every line shorter than 50
    characters. For each remaining line the whitespace-separated fields
    22..41 are converted to float and divided by 100.
    NOTE(review): fields 22..41 are presumably the 20 percentage columns of
    a PSI-BLAST ASCII PSSM row -- confirm against the input files.
    """
    profile = []
    with open(path) as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number < 4 or len(line) < 50:
                continue
            fields = line.split()
            profile.append([float(x) / 100 for x in fields[22:42]])
    return profile


def build_window_features(profile, half_ws):
    """Return one flattened sliding-window feature vector per profile row.

    The profile is padded with ``half_ws`` all-zero rows (20 zeros each) on
    both ends, so that every original row is the centre of a window of
    ``2 * half_ws + 1`` rows. Each window is flattened into a single list.
    """
    # The padding rows are only read, never modified, so sharing them between
    # the two ends is safe.
    padding = [[0.0] * 20 for _ in range(half_ws)]
    padded = padding + list(profile) + padding
    span = 2 * half_ws + 1
    return [
        [value for row in padded[start:start + span] for value in row]
        for start in range(len(profile))
    ]


def load_label_table(path):
    """Read the label file and return ``{protein name: label string}``.

    The file is consumed in records of three lines: the first line is the
    protein name and the third line is its label string (the second line is
    ignored). Names and label strings are stored with surrounding whitespace
    removed. A trailing record without a third line is not stored.
    """
    table = {}
    name = None
    with open(path) as handle:
        for index, line in enumerate(handle):
            position = index % 3
            if position == 0:
                name = line.strip()
            elif position == 2:
                table[name] = line.strip()
    return table


def encode_labels(label_string, label_codes):
    """Translate each character of ``label_string`` through ``label_codes``.

    Raises:
        ValueError: if a character has no entry in ``label_codes``. Silently
            dropping it would leave the labels one short of the feature rows
            and shift every later label onto the wrong residue.
    """
    codes = []
    for symbol in label_string:
        if symbol not in label_codes:
            raise ValueError('unknown label symbol %r' % symbol)
        codes.append(label_codes[symbol])
    return codes


def main():
    """Run 5-fold cross-validation of a decision tree for window sizes 3..25.

    Reads every ``*.pssm`` file in the current directory and the label file
    ``0309homo70_with_labels.txt`` once, then, for each odd window size,
    builds the windowed features and prints the cross-validation scores.

    Raises:
        IOError: if no ``*.pssm`` file is found.
        ValueError: if a protein has no labels, an unknown label symbol, or a
            label count that differs from its number of PSSM rows.
    """
    import glob
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import cross_val_score

    # The inputs do not depend on the window size, so parse them only once.
    label_table = load_label_table('0309homo70_with_labels.txt')
    profiles = []
    label_list2 = []
    # glob order is kept as-is: the fold assignment depends on sample order.
    for filename in glob.glob('*.pssm'):
        protein_id = protein_id_from_filename(filename).strip()
        if protein_id not in label_table:
            raise ValueError(
                'no labels found for %r (from file %r)' % (protein_id, filename))
        profile = read_pssm_profile(filename)
        try:
            labels = encode_labels(label_table[protein_id], dict_of_labels)
        except ValueError as err:
            raise ValueError('%s in labels of %r' % (err, protein_id))
        # Features and labels are paired purely by position, so their counts
        # must agree protein by protein.
        if len(labels) != len(profile):
            raise ValueError(
                '%r has %d PSSM rows but %d labels'
                % (protein_id, len(profile), len(labels)))
        profiles.append(profile)
        label_list2.extend(labels)

    if not profiles:
        raise IOError('no *.pssm files found in the current directory')

    for window_size in range(3, 27, 2):
        half_ws = window_size // 2
        ws_list = []
        for profile in profiles:
            ws_list.extend(build_window_features(profile, half_ws))

        clf = DecisionTreeClassifier(random_state=20)
        scores = cross_val_score(clf, ws_list, label_list2, cv=5, verbose=40, n_jobs=-1)
        print(scores)
        print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))
        print('number %s has been done' % window_size)


# Guarding the entry point keeps the module importable without starting the
# experiment; this also matters because n_jobs=-1 may start worker processes
# that import this module.
if __name__ == '__main__':
    main()