-
Notifications
You must be signed in to change notification settings - Fork 822
/
trains.py
294 lines (245 loc) · 11.4 KB
/
trains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Author: kerlomz <kerlomz@gmail.com>
import tensorflow as tf
tf.compat.v1.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()
try:
gpus = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)
except Exception as e:
print(e, "No available gpu found.")
# from tensorflow.python.platform.build_info import build_info
import core
import utils
import utils.data
import validation
from config import *
from tf_graph_util import convert_variables_to_constants
from PIL import ImageFile
# if build_info['cuda_version'] == '64_110':
ImageFile.LOAD_TRUNCATED_IMAGES = True
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
class Trains:
stop_flag: bool = False
"""训练任务的类"""
def __init__(self, model_conf: ModelConfig):
"""
:param model_conf: 读取工程配置文件
"""
self.model_conf = model_conf
self.validation = validation.Validation(self.model_conf)
def compile_graph(self, acc):
"""
编译当前准确率下对应的计算图为pb模型,准确率仅作为模型命名的一部分
:param acc: 准确率
:return:
"""
input_graph = tf.compat.v1.Graph()
tf.compat.v1.keras.backend.clear_session()
tf.compat.v1.reset_default_graph()
predict_sess = tf.compat.v1.Session(graph=input_graph)
tf.compat.v1.keras.backend.set_session(predict_sess)
with predict_sess.graph.as_default():
model = core.NeuralNetwork(
model_conf=self.model_conf,
mode=RunMode.Predict,
backbone=self.model_conf.neu_cnn,
recurrent=self.model_conf.neu_recurrent
)
model.build_graph()
model.build_train_op()
input_graph_def = predict_sess.graph.as_graph_def()
saver = tf.compat.v1.train.Saver(var_list=tf.compat.v1.global_variables())
tf.compat.v1.logging.info(tf.train.latest_checkpoint(self.model_conf.model_root_path))
saver.restore(predict_sess, tf.train.latest_checkpoint(self.model_conf.model_root_path))
output_graph_def = convert_variables_to_constants(
predict_sess,
input_graph_def,
output_node_names=['dense_decoded']
)
if not os.path.exists(self.model_conf.compile_model_path):
os.makedirs(self.model_conf.compile_model_path)
last_compile_model_path = (
os.path.join(self.model_conf.compile_model_path, "{}.pb".format(self.model_conf.model_name))
).replace('.pb', '_{}.pb'.format(int(acc * 10000)))
self.model_conf.output_config(target_model_name="{}_{}".format(self.model_conf.model_name, int(acc * 10000)))
with tf.io.gfile.GFile(last_compile_model_path, mode='wb') as gf:
gf.write(output_graph_def.SerializeToString())
def achieve_cond(self, acc, cost, epoch):
achieve_accuracy = acc >= self.model_conf.trains_end_acc
achieve_cost = cost <= self.model_conf.trains_end_cost
achieve_epochs = epoch >= self.model_conf.trains_end_epochs
over_epochs = epoch > 10000
if (achieve_accuracy and achieve_epochs and achieve_cost) or over_epochs:
return True
return False
def init_captcha_gennerator(self, ran_captcha):
path = self.model_conf.da_random_captcha['FontPath']
if not os.path.exists(path):
exception("Font path does not exist.", code=-6754)
items = os.listdir(path)
fonts = [os.path.join(path, item) for item in items]
ran_captcha.sample = NUMBER + ALPHA_UPPER + ALPHA_LOWER
ran_captcha.fonts_list = fonts
ran_captcha.check_font()
ran_captcha.rgb_r = [0, 255]
ran_captcha.rgb_g = [0, 255]
ran_captcha.rgb_b = [0, 255]
ran_captcha.fonts_num = [4, 8]
def train_process(self):
"""
训练任务
:return:
"""
# 输出重要的配置参数
self.model_conf.println()
# 定义网络结构
model = core.NeuralNetwork(
mode=RunMode.Trains,
model_conf=self.model_conf,
backbone=self.model_conf.neu_cnn,
recurrent=self.model_conf.neu_recurrent
)
model.build_graph()
tf.compat.v1.logging.info('Loading Trains DataSet...')
train_feeder = utils.data.DataIterator(
model_conf=self.model_conf, mode=RunMode.Trains
)
train_feeder.read_sample_from_tfrecords(self.model_conf.trains_path[DatasetType.TFRecords])
tf.compat.v1.logging.info('Loading Validation DataSet...')
validation_feeder = utils.data.DataIterator(
model_conf=self.model_conf, mode=RunMode.Validation
)
validation_feeder.read_sample_from_tfrecords(self.model_conf.validation_path[DatasetType.TFRecords])
tf.compat.v1.logging.info('Total {} Trains DataSets'.format(train_feeder.size))
tf.compat.v1.logging.info('Total {} Validation DataSets'.format(validation_feeder.size))
if validation_feeder.size >= train_feeder.size:
exception("The number of training sets cannot be less than the validation set.", )
if validation_feeder.size < self.model_conf.validation_batch_size:
exception("The number of validation sets cannot be less than the validation batch size.", )
num_train_samples = train_feeder.size
num_validation_samples = validation_feeder.size
if num_validation_samples < self.model_conf.validation_batch_size:
self.model_conf.validation_batch_size = num_validation_samples
tf.compat.v1.logging.warn(
'The number of validation sets is less than the validation batch size, '
'will use validation set size as validation batch size.'.format(validation_feeder.size))
num_batches_per_epoch = int(num_train_samples / self.model_conf.batch_size)
model.build_train_op(num_train_samples)
# 会话配置
sess_config = tf.compat.v1.ConfigProto(
allow_soft_placement=True,
# log_device_placement=False,
gpu_options=tf.compat.v1.GPUOptions(
allocator_type='BFC',
allow_growth=True, # it will cause fragmentation.
# per_process_gpu_memory_fraction=0.3
)
)
accuracy = 0
epoch_count = 1
if num_train_samples < 500:
save_step = 10
trains_validation_steps = 50
else:
save_step = 100
trains_validation_steps = self.model_conf.trains_validation_steps
sess = tf.compat.v1.Session(config=sess_config)
init_op = tf.compat.v1.global_variables_initializer()
sess.run(init_op)
saver = tf.compat.v1.train.Saver(var_list=tf.compat.v1.global_variables(), max_to_keep=3)
train_writer = tf.compat.v1.summary.FileWriter('logs', sess.graph)
# try:
checkpoint_state = tf.train.get_checkpoint_state(self.model_conf.model_root_path)
if checkpoint_state and checkpoint_state.model_checkpoint_path:
# 加载被中断的训练任务
saver.restore(sess, checkpoint_state.model_checkpoint_path)
tf.compat.v1.logging.info('Start training...')
# 进入训练任务循环
while 1:
start_time = time.time()
batch_cost = 65535
# 批次循环
for cur_batch in range(num_batches_per_epoch):
if self.stop_flag:
break
batch_time = time.time()
trains_batch = train_feeder.generate_batch_by_tfrecords(sess)
batch_inputs, batch_labels = trains_batch
feed = {
model.inputs: batch_inputs,
model.labels: batch_labels,
model.utils.is_training: True
}
summary_str, batch_cost, step, _, seq_len = sess.run(
[model.merged_summary, model.cost, model.global_step, model.train_op, model.seq_len],
feed_dict=feed
)
train_writer.add_summary(summary_str, step)
if step % save_step == 0 and step != 0:
tf.compat.v1.logging.info(
'Step: {} Time: {:.3f} sec/batch, Cost = {:.8f}, BatchSize: {}, Shape[1]: {}'.format(
step,
time.time() - batch_time,
batch_cost,
len(batch_inputs),
seq_len[0]
)
)
# 达到保存步数对模型过程进行存储
if step % save_step == 0 and step != 0:
saver.save(sess, self.model_conf.save_model, global_step=step)
# 进入验证集验证环节
if step % trains_validation_steps == 0 and step != 0:
batch_time = time.time()
validation_batch = validation_feeder.generate_batch_by_tfrecords(sess)
test_inputs, test_labels = validation_batch
val_feed = {
model.inputs: test_inputs,
model.labels: test_labels,
model.utils.is_training: False
}
dense_decoded, lr = sess.run(
[model.dense_decoded, model.lrn_rate],
feed_dict=val_feed
)
# 计算准确率
accuracy = self.validation.accuracy_calculation(
validation_feeder.labels,
dense_decoded,
)
log = "Epoch: {}, Step: {}, Accuracy = {:.4f}, Cost = {:.5f}, " \
"Time = {:.3f} sec/batch, LearningRate: {}"
tf.compat.v1.logging.info(log.format(
epoch_count,
step,
accuracy,
batch_cost,
time.time() - batch_time,
lr / len(validation_batch),
))
# 满足终止条件但尚未完成当前epoch时跳出epoch循环
if self.achieve_cond(acc=accuracy, cost=batch_cost, epoch=epoch_count):
break
# 满足终止条件时,跳出任务循环
if self.stop_flag:
break
if self.achieve_cond(acc=accuracy, cost=batch_cost, epoch=epoch_count):
# sess.close()
tf.compat.v1.keras.backend.clear_session()
sess.close()
self.compile_graph(accuracy)
tf.compat.v1.logging.info('Total Time: {} sec.'.format(time.time() - start_time))
break
epoch_count += 1
tf.compat.v1.logging.info('Total Time: {} sec.'.format(time.time() - start_time))
def main(argv):
project_name = argv[-1]
model_conf = ModelConfig(project_name=project_name)
Trains(model_conf).train_process()
tf.compat.v1.logging.info('Training completed.')
pass
if __name__ == '__main__':
# tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
tf.compat.v1.app.run()