gensim_lightgbm.py
###################################start###################################
## Load packages
import re
import os
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from gensim.models import Word2Vec
################################# data read ################################
# Working directory setup
data_path = 'E:/work/protein/'
os.chdir(data_path)  # set the current working directory
print(os.getcwd())  # report the current working directory
# Read the data
df_protein_train = pd.read_csv('df_protein_train.csv') #1653,2
df_protein_test = pd.read_csv('df_protein_test.csv') #414,2
protein_concat = pd.concat([df_protein_train,df_protein_test]) #2067,2
df_molecule = pd.read_csv('df_molecule.csv') #111216,20
df_affinity_train = pd.read_csv('df_affinity_train.csv') #165084,3
df_affinity_test = pd.read_csv('df_affinity_test_toBePredicted.csv')#41383,2
df_affinity_test['Ki'] = -11  # sentinel Ki marking test rows #41383,3
data = pd.concat([df_affinity_train,df_affinity_test]) #206467,3
##############################################################################
########### feature ###########
##############################################################################
# 1. Expand the Fingerprint string into 167 separate columns
feat = []
for i in range(0, len(df_molecule)):
    feat.append(df_molecule['Fingerprint'][i].split(','))
feat = pd.DataFrame(feat)
feat = feat.astype('int')#111216,167
feat.columns = ["Fingerprint_{0}".format(i) for i in range(0, 167)]
feat["Molecule_ID"] = df_molecule['Molecule_ID']#111216,168
data = data.merge(feat, on='Molecule_ID', how='left')#206467,170
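# Vectorized alternative to the loop above (a sketch; should build the same
# frame via pandas string methods):
# feat = df_molecule['Fingerprint'].str.split(',', expand=True).astype(int)
# feat.columns = ["Fingerprint_{0}".format(i) for i in range(0, 167)]
# feat["Molecule_ID"] = df_molecule['Molecule_ID']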
# 2. Merge the remaining df_molecule features (everything except Fingerprint)
feat = df_molecule.drop('Fingerprint',axis=1)#111216,19
data = data.merge(feat, on='Molecule_ID', how='left')#206467,188
# 3. Protein sequences: train word vectors on amino-acid 3-grams
n = 128
texts = [[word for word in re.findall(r'.{3}', document)]
         for document in list(protein_concat['Sequence'])]
# gensim < 4.0 API ('size' was renamed to 'vector_size' in gensim 4.0)
model = Word2Vec(texts, size=n, window=4, min_count=1, negative=3,
                 sg=1, sample=0.001, hs=1, workers=4)
vectors = pd.DataFrame([model[word] for word in model.wv.vocab])  # one vector per vocabulary word
vectors['Word'] = list(model.wv.vocab)  # the word each vector belongs to
vectors.columns = ["vec_{0}".format(i) for i in range(0, n)] + ["Word"]
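# Hypothetical gensim >= 4.0 equivalent of the block above (untested sketch;
# the keyword, vocab attribute, and item access were all renamed in 4.0):
# model = Word2Vec(texts, vector_size=n, window=4, min_count=1, negative=3,
#                  sg=1, sample=0.001, hs=1, workers=4)
# vocab = list(model.wv.key_to_index)
# vectors = pd.DataFrame([model.wv[word] for word in vocab])
# vectors['Word'] = vocab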
wide_vec = pd.DataFrame()
result1 = []
aa = list(protein_concat['Protein_ID'])
for i in range(len(texts)):
    result2 = []
    for w in range(len(texts[i])):
        result2.append(aa[i])
    result1.extend(result2)
wide_vec['Id'] = result1  # the Protein_ID repeated once per 3-gram
result1 = []
for i in range(len(texts)):
    result2 = []
    for w in range(len(texts[i])):
        result2.append(texts[i][w])
    result1.extend(result2)
wide_vec['Word'] = result1  # the 3-grams contained in each protein
del result1, result2
wide_vec = wide_vec.merge(vectors, on='Word', how='left')
wide_vec = wide_vec.drop('Word', axis=1)
wide_vec.columns = ['Protein_ID'] + ["vec_{0}".format(i) for i in range(0, n)]  # one vector row per 3-gram, keyed by Protein_ID
del vectors
name = ["vec_{0}".format(i) for i in range(0,n)]
feat = pd.DataFrame(wide_vec.groupby(['Protein_ID'])[name].agg('mean')).reset_index()  # mean-pool the 3-gram vectors per protein
feat.columns = ["Protein_ID"] + ["mean_ci_{0}".format(i) for i in range(0, n)]
data = data.merge(feat, on='Protein_ID', how='left')
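# Leaner alternative (sketch; should match the groupby-mean above): average each
# protein's 3-gram vectors straight from the model and skip wide_vec entirely.
# feat = pd.DataFrame([np.mean([model[w] for w in doc], axis=0) for doc in texts],
#                     columns=["mean_ci_{0}".format(i) for i in range(0, n)])
# feat["Protein_ID"] = list(protein_concat['Protein_ID'])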
#################################### lgb ############################
train_feat = data[data['Ki'] > -11].fillna(0)  # NaNs filled with 0; other imputations possible
test_feat = data[data['Ki'] <= -11].fillna(0)
label_x = train_feat['Ki']
label_y = test_feat['Ki']  # the -11 sentinel, used only as a placeholder label
submission = test_feat[['Protein_ID', 'Molecule_ID']].copy()  # .copy() avoids a SettingWithCopyWarning below
train_feat = train_feat.drop(['Ki', 'Protein_ID', 'Molecule_ID'], axis=1)
test_feat = test_feat.drop(['Ki', 'Protein_ID', 'Molecule_ID'], axis=1)
# LightGBM training
train = lgb.Dataset(train_feat, label=label_x)
test = lgb.Dataset(test_feat, label=label_y, reference=train)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'l2',
    #'objective': 'multiclass',
    #'metric': 'multi_error',
    #'num_class': 5,
    'min_child_weight': 3,
    'num_leaves': 2 ** 5,
    'lambda_l2': 10,
    'subsample': 0.7,
    'subsample_freq': 1,  # bagging only takes effect with a nonzero frequency
    'colsample_bytree': 0.7,
    'learning_rate': 0.05,
    'seed': 2017,
    'nthread': 12,
    'verbose': -1
}
num_round = 3000
gbm = lgb.train(params,
                train,
                num_round,
                verbose_eval=50,  # LightGBM < 4.0 API; 4.0 moved this to callbacks
                valid_sets=[train, test]
                )
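# Hypothetical LightGBM >= 4.0 call (untested sketch): verbose_eval was removed
# in favor of callbacks. Note the valid set carries the -11 sentinel labels, so
# its metric is not a meaningful validation signal:
# gbm = lgb.train(params, train, num_round,
#                 valid_sets=[train],
#                 callbacks=[lgb.log_evaluation(period=50)])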
preds_sub = gbm.predict(test_feat)
# Save the result
nowTime = datetime.datetime.now().strftime('%m%d%H%M')  # current timestamp
name = 'mkd_' + nowTime + '.csv'
submission['Ki'] = preds_sub
submission.to_csv(name, index=False)