-
Notifications
You must be signed in to change notification settings - Fork 0
/
mrslpred_motif.py
114 lines (85 loc) · 3.72 KB
/
mrslpred_motif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import numpy as np
import pandas as pd
import pickle
import os
import Bio
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
import glob
from Bio import SeqIO
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--file", "-f", type=str, help='Path to fasta file', required=True)
parser.add_argument("--th1", "-t1", type=float, help='Threshold for Ribosome', default=0.3079)
parser.add_argument("--th2", "-t2", type=float, help='Threshold for Cytosol', default=0.1468)
parser.add_argument("--th3", "-t3", type=float, help='Threshold for Endoplasmic Reticulum', default=0.1156)
parser.add_argument("--th4", "-t4", type=float, help='Threshold for Membrane', default=0.1958)
parser.add_argument("--th5", "-t5", type=float, help='Threshold for Nucleus', default=0.7028)
parser.add_argument("--th6", "-t6", type=float, help='Threshold for Exosome', default=0.9961)
parser.add_argument("--output", "-o", type=str, help='Path to output', required=True)
nf_path = os.path.dirname(__file__)
args = parser.parse_args()
with open(nf_path + '/xgboost_final.pkl', 'rb') as f:
model = pickle.load(f)
fasta_loc = args.file
th1 = args.th1
th2 = args.th2
th3 = args.th3
th4 = args.th4
th5 = args.th5
th6 = args.th6
output_loc = args.output
exec_string = "python " + nf_path + "/Nfeature_DNA.py -i " + fasta_loc + " -ft CDK -k 3 -o " + output_loc + "/CDK3.csv"
os.system(exec_string)
print(exec_string)
exec_string = "python " + nf_path + "/Nfeature_DNA.py -i " + fasta_loc + " -ft RDK -k 4 -o " + output_loc + "/RDK4.csv"
os.system(exec_string)
# In[103]:
df1 = pd.read_csv(output_loc + "/CDK3.csv")
df1.drop(columns = 'Sequence_ID', inplace=True)
df2 = pd.read_csv(output_loc + "/RDK4.csv")
df2.drop(columns = 'Sequence_ID', inplace=True)
frames = [df1, df2]
df3 = pd.concat(frames, axis=1, ignore_index=True)
df3.shape
y_pred_proba = model.predict_proba(np.array(df3))
y=[]
for i in range(6):
x = y_pred_proba[i]
y.append(x[:,1])
y_pred = np.transpose(np.vstack(y))
# In[106]:
motifs_loc = glob.glob(nf_path + "/motifs/*", )
index = [int(j.split('.')[0]) for j in [i.split('_')[-1] for i in motifs_loc]]
motifs_loc_modified = motifs_loc[:]
for i in range(len(index)):
motifs_loc_modified[index[i]] = motifs_loc[i]
merci_motifs = []
for i in range(len(motifs_loc_modified)):
with open(motifs_loc_modified[i]) as f:
merci_motifs.append([line.rstrip() for line in f])
sequences = [str(seq_record.seq) for seq_record in SeqIO.parse(fasta_loc, "fasta")]
identifiers = [seq_record.id for seq_record in SeqIO.parse(fasta_loc, "fasta")]
motif_occurences = []
for i in range(len(merci_motifs)):
x = merci_motifs[i]
dummy_list = []
for j in range(len(x)):
dummy_list.append([int(merci_motifs[i][j] in k) for k in sequences])
combined_scores = np.array(dummy_list)
one_count = np.count_nonzero(combined_scores, axis=0)
one_count[one_count != 0] = 1
motif_occurences.append(one_count)
y_pred_motif = y_pred + np.transpose(np.array(motif_occurences))
y_pred_motif[y_pred_motif > 1] = 1
thresholds = [th1, th2, th3, th4, th5, th6]
prob_df = pd.DataFrame(np.array(y_pred_motif), columns = ['Ribosome', 'Cytosol', 'ER', 'Membrane', 'Nucleus', 'Exosome'])
prob_df_copy = prob_df.copy()
prob_df_copy.insert(loc = 0, column = 'Seq ID', value = identifiers)
for i in range(6):
x = prob_df.iloc[:,i]
prob_df.iloc[:,i][x > thresholds[i]] = 'Yes'
prob_df.iloc[:,i][x <= thresholds[i]] = 'No'
prob_df.insert(loc = 0, column = 'Seq ID', value = identifiers)
prob_df_copy.to_csv(output_loc +'/final_prob_prediction.csv', header=True, index=False)
prob_df.to_csv(output_loc +'/final_prediction.csv', header=True, index=False)