rlagent.py
import tensorflow as tf
import simmodel
import numpy as np


class ExperienceReplayMemory(object):
    """
    Implementation of the replay memory.
    """

    def __init__(self, replay_memory_size):
        self.replay_memory_size = replay_memory_size
        self.consolidated_experience = EpisodeExperience()

    def store_experience(self, episode_history):
        self.consolidated_experience.consolidate(episode_history)

        while self.consolidated_experience.size() > self.replay_memory_size:
            self.consolidated_experience.purge()

    def sample_transitions(self, batch_size):
        """
        Randomly sample a batch of experiences from the replay memory.
        :param batch_size:
        :return:
        """
        batch = EpisodeExperience()

        if self.consolidated_experience.size() == 0:
            raise Exception("No transitions for sampling.")

        selected_indexes = np.arange(self.consolidated_experience.size())
        np.random.shuffle(selected_indexes)
        selected_indexes = selected_indexes[:batch_size]

        for index in selected_indexes:
            batch.observe_action_effects(state=self.consolidated_experience.states[index],
                                         action=self.consolidated_experience.actions[index],
                                         reward=self.consolidated_experience.rewards[index],
                                         new_state=self.consolidated_experience.new_states[index])

        return np.array(batch.states), np.array(batch.actions), np.array(batch.rewards), np.array(batch.new_states)
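
# Usage sketch (illustrative, not part of the original module): how the replay memory
# is typically exercised. The episode_history object and the batch size below are
# assumptions for the example; episode_history is expected to be an EpisodeExperience
# filled during a simulation episode.
#
#   memory = ExperienceReplayMemory(replay_memory_size=10000)
#   memory.store_experience(episode_history)  # append transitions, then trim oldest to capacity
#   states, actions, rewards, new_states = memory.sample_transitions(batch_size=32)
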
class EpisodeExperience(object):

    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.new_states = []

    def size(self):
        return len(self.states)

    def observe_action_effects(self, state, action, reward, new_state):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.new_states.append(new_state)

    def consolidate(self, episode_history):
        self.states += episode_history.states
        self.actions += episode_history.actions
        self.rewards += episode_history.rewards
        self.new_states += episode_history.new_states

    def purge(self):
        self.states.pop(0)
        self.actions.pop(0)
        self.rewards.pop(0)
        self.new_states.pop(0)

class DeepQLearner(object):

    def __init__(self, name, input_number, hidden_units, logger, learning_rate=None, discount_factor=None,
                 counter_for_learning=None,
                 initial_epsilon=None, final_epsilon=None, decay_steps=None, replay_memory_size=None,
                 global_step=None):
        self.logger = logger
        self.name = name

        self.episode_experience = EpisodeExperience()
        self.metric_catalogue = []

        self.actions = [simmodel.CLEAN_ACTION, simmodel.SLOPPY_ACTION]

        self.prediction_scope = self.name + '-prediction_network'
        self.pred_states, self.pred_q_values = self.build_network(self.prediction_scope, input_number, hidden_units)

        if learning_rate is not None:
            self.initial_epsilon = initial_epsilon
            self.final_epsilon = final_epsilon
            self.decay_steps = decay_steps

            self.replay_memory = ExperienceReplayMemory(replay_memory_size=replay_memory_size)
            self.discount_factor = discount_factor
            self.counter_for_learning = counter_for_learning

            self.target_scope = self.name + '-target_network'
            self.target_states, self.target_q_values = self.build_network(self.target_scope, input_number,
                                                                           hidden_units)

            self.train_target_q, self.train_actions, self.train_loss, self.train_operation = self.build_training_operation(
                learning_rate=learning_rate, global_step=global_step, variable_scope=self.name + '-train')

    def new_episode(self):
        self.episode_experience = EpisodeExperience()

    def observe_action_effects(self, state, action, reward, new_state):
        self.episode_experience.observe_action_effects(state, action, reward, new_state)

    def sample_transitions(self, batch_size):
        self.logger.debug(self.name + "-Memory size: " + str(
            self.replay_memory.consolidated_experience.size()) + ". Sampling: " + str(batch_size))
        return self.replay_memory.sample_transitions(batch_size)

    def store_experience(self):
        self.logger.debug(self.name + "-Consolidating experience: " + str(self.episode_experience.size()))
        self.replay_memory.store_experience(self.episode_experience)
    def build_network(self, variable_scope, input_number, hidden_units):
        """
        Creates the Deep Q-network. It takes a state as input and produces a Q-value estimate per action as output.
        :param variable_scope:
        :param input_number:
        :param hidden_units:
        :return:
        """
        with tf.variable_scope(variable_scope):
            states = tf.placeholder(tf.float32, shape=[None, input_number], name="state")

            initializer = tf.variance_scaling_initializer()
            hidden_layer_1 = tf.layers.dense(states, hidden_units, activation=tf.nn.elu,
                                             kernel_initializer=initializer,
                                             name="hidden_1")
            hidden_layer_2 = tf.layers.dense(hidden_layer_1, hidden_units, activation=tf.nn.elu,
                                             kernel_initializer=initializer,
                                             name="hidden_2")
            outputs = tf.layers.dense(hidden_layer_2, len(self.actions), kernel_initializer=initializer,
                                      name="q_values")

        return states, outputs
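
    # Architecture sketch (illustrative): the numbers below are assumptions, not values
    # from this module. With input_number=3 and hidden_units=24, the prediction network
    # maps a batch of states of shape (batch, 3) through two ELU layers of 24 units to a
    # linear output of shape (batch, 2), one Q-value per action in
    # [simmodel.CLEAN_ACTION, simmodel.SLOPPY_ACTION].
    #
    #   states_ph, q_values_op = learner.build_network("example-scope", input_number=3, hidden_units=24)
    #   q = session.run(q_values_op, feed_dict={states_ph: [[0.0, 1.0, 0.5]]})  # q.shape == (1, 2)
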
    def record_metric(self, metric_value):
        self.metric_catalogue.append(metric_value)

    def clear_metrics(self):
        self.metric_catalogue = []

    def get_current_epsilon(self, global_counter):
        if hasattr(self, 'decay_steps'):
            return max(self.final_epsilon,
                       self.initial_epsilon - (
                               self.initial_epsilon - self.final_epsilon) * global_counter / self.decay_steps)
        else:
            return None
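
    # Decay sketch (illustrative numbers, assumed for the example): with initial_epsilon=1.0,
    # final_epsilon=0.1 and decay_steps=1000, epsilon falls linearly with the global counter
    # and is clamped at the final value:
    #
    #   global_counter = 0    -> 1.0
    #   global_counter = 500  -> 1.0 - 0.9 * 500 / 1000 = 0.55
    #   global_counter = 2000 -> max(0.1, 1.0 - 0.9 * 2) = 0.1
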
    def select_action(self, system_state, global_counter, session):
        """
        Explores the system using an epsilon-greedy policy.
        :param system_state:
        :param global_counter:
        :param session:
        :return:
        """
        prob_random = self.get_current_epsilon(global_counter)

        q_values_from_pred = session.run(self.pred_q_values, feed_dict={self.pred_states: [system_state]})

        if (prob_random is not None and np.random.random() < prob_random) or (
                hasattr(self, 'counter_for_learning') and global_counter < self.counter_for_learning):
            self.logger.debug(self.name + "-system state: %s prob_random: %.2f", str(system_state), prob_random)

            action = np.random.randint(len(self.actions))
            self.logger.debug(self.name + "-Behaving randomly: %s", str(action))
            return action
        else:
            action = np.argmax(q_values_from_pred)
            self.logger.debug(self.name + "-Behaving greedily: %s q_values_from_pred: %s", str(action),
                              str(q_values_from_pred))
            return action
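
    # Selection sketch (illustrative): the return value is an index into self.actions, so the
    # caller maps it back to a concrete action; system_state, global_counter and session are
    # assumed to come from the surrounding simulation code.
    #
    #   action_index = learner.select_action(system_state, global_counter, session)
    #   chosen_action = learner.actions[action_index]  # simmodel.CLEAN_ACTION or simmodel.SLOPPY_ACTION
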
    def build_training_operation(self, learning_rate, global_step, variable_scope):
        """
        Generates the training operations for the Prediction DQN.
        :param learning_rate:
        :param global_step:
        :param variable_scope:
        :return:
        """
        with tf.variable_scope(variable_scope):
            train_target_q = tf.placeholder(tf.float32, [None], name="target_q_values")
            train_actions = tf.placeholder(tf.int64, [None], name="actions")

            actions_one_hot = tf.one_hot(train_actions, len(self.actions), 1.0, 0.0, name="actions_one_hot")
            action_q_values = tf.reduce_sum(self.pred_q_values * actions_one_hot, axis=1, name="action_q_values")

            delta = tf.square(train_target_q - action_q_values)
            loss = tf.reduce_mean(delta, name="loss")

            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        return train_target_q, train_actions, loss, optimizer.minimize(loss, global_step=global_step)
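
    # Loss sketch (illustrative numbers): the one-hot mask keeps only the Q-value of the action
    # that was actually taken, and the loss is the mean squared error against the supplied target.
    #
    #   pred_q_values   = [[0.2, 0.7]]   # estimates for [CLEAN_ACTION, SLOPPY_ACTION]
    #   train_actions   = [1]            # SLOPPY_ACTION was taken
    #   action_q_values = [0.7]          # selected by the one-hot mask
    #   train_target_q  = [1.0]
    #   loss = mean((1.0 - 0.7) ** 2) = 0.09
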
    def train(self, session, batch_size):
        state_list, action_list, reward_list, next_state_list = self.sample_transitions(batch_size)

        self.logger.debug(self.name + "-Starting transition target calculations...")
        target_q_values = self.calculate_transition_targets(session=session,
                                                            reward_list=reward_list,
                                                            next_state_list=next_state_list)

        self.logger.debug(self.name + "-Starting training ...")
        self.train_network(session=session,
                           target_q_values=target_q_values,
                           action_list=action_list, state_list=state_list)

    def calculate_transition_targets(self, session, reward_list, next_state_list):
        next_q_values = session.run(self.target_q_values, feed_dict={self.target_states: next_state_list})
        max_next_q_value = np.max(next_q_values, axis=1)

        target_q_values = reward_list + self.discount_factor * max_next_q_value
        return target_q_values
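
    # Target sketch (illustrative numbers): each target follows the Q-learning update
    # target = reward + discount_factor * max_a' Q_target(next_state, a'), evaluated with
    # the target network rather than the prediction network.
    #
    #   next_q_values    = [[0.4, 0.9], [0.1, -0.2]]
    #   max_next_q_value = [0.9, 0.1]
    #   reward_list      = [1.0, 0.0], discount_factor = 0.99
    #   target_q_values  = [1.0 + 0.99 * 0.9, 0.0 + 0.99 * 0.1] = [1.891, 0.099]
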
    def train_network(self, session, target_q_values, action_list, state_list):
        _, q_values, loss = session.run([self.train_operation, self.pred_q_values, self.train_loss], feed_dict={
            self.train_target_q: target_q_values,
            self.train_actions: action_list,
            self.pred_states: state_list})

        return q_values

    def update_target_weights(self, session):
        """
        Copies the Prediction DQN weights to the Target DQN.
        :param session:
        :return:
        """
        target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.target_scope)
        pred_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.prediction_scope)

        for target_variable, pred_variable in zip(target_vars, pred_vars):
            weight_from_pred = tf.placeholder(tf.float32, name="weight_from_pred")
            session.run(target_variable.assign(weight_from_pred),
                        feed_dict={weight_from_pred: pred_variable.eval()})
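
# Training-loop sketch (illustrative, not part of the original module): a minimal cadence for
# driving the learner. The constructor arguments, batch size, episode count and target-update
# frequency below are assumptions; the simulation step itself lives in simmodel.
#
#   learner = DeepQLearner("developer", input_number=3, hidden_units=24, logger=logger,
#                          learning_rate=1e-4, discount_factor=0.99, counter_for_learning=1000,
#                          initial_epsilon=1.0, final_epsilon=0.1, decay_steps=10000,
#                          replay_memory_size=10000, global_step=global_step)
#   for episode in range(total_episodes):
#       learner.new_episode()
#       # ... run one simulated episode, calling select_action / observe_action_effects per step
#       learner.store_experience()
#       if global_counter >= counter_for_learning:
#           learner.train(session, batch_size=32)
#       if episode % target_update_frequency == 0:
#           learner.update_target_weights(session)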