-
Notifications
You must be signed in to change notification settings - Fork 62
/
model_CNN_attention_accusation.py
96 lines (82 loc) · 3.13 KB
/
model_CNN_attention_accusation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import time

import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, BatchNormalization
from keras.layers import GRU, MaxPooling1D, Bidirectional
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from sklearn.model_selection import train_test_split

from attention import attention
from evaluate import predict2both, predict2half, predict2top, f1_avg
# Log run start time and which task this script trains (accusation labels).
print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('accusation')
# Vocabulary size and padded sequence length chosen at preprocessing time;
# both are baked into the .npy filenames below.
num_words = 80000
maxlen = 400
print('num_words = 80000, maxlen = 400')
# Fact dataset: pre-tokenized, padded case-fact sequences
# (presumably n_samples x maxlen integer ids — confirm against data_deal).
fact = np.load('./data_deal/big_fact_pad_seq_%d_%d.npy' % (num_words, maxlen))
fact_train, fact_test = train_test_split(fact, test_size=0.05, random_state=1)
del fact  # free the full array; only the split views are needed from here on
# Label dataset: multi-hot accusation labels. Same test_size and random_state
# as the fact split, so rows stay aligned between facts and labels.
labels = np.load('./data_deal/labels/big_labels_accusation.npy')
labels_train, labels_test = train_test_split(labels, test_size=0.05, random_state=1)
del labels
# Accusation label set. NOTE(review): loaded but never used below — confirm
# whether it is needed or can be removed.
set_accusation = np.load('./data_deal/set/set_accusation.npy')
# Build the CNN + attention classifier for multi-label accusation prediction.
# Input: integer token sequences of length `maxlen`.
# Output: one independent sigmoid per accusation label.
inputs = Input(shape=[maxlen])
embedded = Embedding(input_dim=num_words + 1,
                     input_length=maxlen,
                     output_dim=512,
                     mask_zero=0,
                     name='Embedding')(inputs)
# One wide 1-D convolution over the token embeddings, then project attention
# over the conv features before pooling them down to a fixed-size vector.
features = Conv1D(filters=512, kernel_size=[3], strides=1, padding='same',
                  activation='relu')(embedded)
features = attention(input=features, depth=512)
pooled = BatchNormalization()(GlobalMaxPool1D()(features))
hidden = Dropout(0.2)(Dense(1000, activation="relu")(pooled))
outputs = Dense(labels_train.shape[1], activation="sigmoid")(hidden)
# Multi-label setup: sigmoid outputs trained with binary cross-entropy.
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Train for 20 one-epoch rounds, checkpointing the model and evaluating on the
# held-out split after every epoch.
n_start = 1
n_end = 21
score_list1 = []  # per-epoch exact-match accuracy under three decision rules
score_list2 = []  # per-epoch average F1 under the same three rules
# Build the checkpoint directory up front: model.save fails if it is missing.
model_dir = './model/%d_%d/accusation' % (num_words, maxlen)
os.makedirs(model_dir, exist_ok=True)
for epoch in range(n_start, n_end):
    model.fit(x=fact_train, y=labels_train, batch_size=512, epochs=1, verbose=1)
    model.save('%s/CNN_attention_epochs_%d.h5' % (model_dir, epoch))
    # Predict directly on fact_test; the original copied it first (fact_test[:])
    # for no benefit.
    y = model.predict(fact_test)
    y1 = predict2top(y)   # keep only the highest-confidence label
    y2 = predict2half(y)  # keep labels with confidence > 0.5
    y3 = predict2both(y)  # combination of the two rules above
    print('%s accu:' % epoch)
    # Exact-match accuracy: a sample counts only if every label bit matches.
    # Vectorized replacement of the original per-row comprehensions, whose
    # index variable also shadowed the epoch loop variable `i`.
    s1 = np.all(labels_test == y1, axis=1).mean()
    print(s1)
    s2 = np.all(labels_test == y2, axis=1).mean()
    print(s2)
    s3 = np.all(labels_test == y3, axis=1).mean()
    print(s3)
    print('%s f1:' % epoch)
    # Average F1 under each decision rule.
    s4 = f1_avg(y_pred=y1, y_true=labels_test)
    print(s4)
    s5 = f1_avg(y_pred=y2, y_true=labels_test)
    print(s5)
    s6 = f1_avg(y_pred=y3, y_true=labels_test)
    print(s6)
    score_list1.append([epoch, s1, s2, s3])
    score_list2.append([epoch, s4, s5, s6])
# Summary tables: one row per epoch, columns = [epoch, top, half, both].
print(pd.DataFrame(score_list1))
print(pd.DataFrame(score_list2))
print('end', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))