# import template
import tensorflow as tf
import numpy as np


class Agent:
    def __init__(self, state_size, num_action, reward_discount, learning_rate, exploration_strategy):
        self.state_size = state_size
        self.num_action = num_action
        self.reward_discount = reward_discount
        self.exploration_strategy = exploration_strategy
        self.iter = 0
        self.data_type = tf.float32
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.avg_loss = tf.keras.metrics.Mean(name='loss')
        self.model = self.build_model('model')
        self.is_shutdown_explore = False
        self.buffer = []
        self.reset_buffer()

    def build_model(self, name):
        # Two-hidden-layer MLP that maps a state to a softmax distribution over actions
        nn_input = tf.keras.Input(shape=self.state_size, dtype=self.data_type)
        x = tf.keras.layers.Dense(units=128)(nn_input)
        x = tf.keras.layers.ReLU()(x)
        x = tf.keras.layers.Dense(units=128)(x)
        x = tf.keras.layers.ReLU()(x)
        x = tf.keras.layers.Dense(units=self.num_action)(x)
        nn_output = tf.keras.activations.softmax(x)
        model = tf.keras.Model(name=name, inputs=nn_input, outputs=nn_output)
        return model

    def predict(self, state):
        # Forward pass: probability of each action for the given batch of states
        return self.model(tf.convert_to_tensor(state, self.data_type))

    def loss(self, states, actions, rewards, state_primes):
        # Calculate the accumulated discounted return G_t for every step of the trajectory
        np_rewards = np.array(rewards)
        num_reward = np_rewards.shape[0]
        # Discount factors gamma^0, gamma^1, ..., gamma^(T-1)
        discounts = np.logspace(0, num_reward - 1, base=self.reward_discount, num=num_reward)
        gt = np.zeros(num_reward)
        for i in range(num_reward):
            gt[i] = np.sum(np.multiply(np_rewards[i:], discounts[:num_reward - i]))
        # Normalize the returns to reduce gradient variance; cast to float32 so they can
        # be multiplied with the float32 log-probabilities below
        gt = tf.cast((gt - tf.math.reduce_mean(gt)) / (tf.math.reduce_std(gt) + 1e-9), self.data_type)
        predicts = self.predict(states)
        # Log-probability of the action actually taken at each step
        # (equivalently, the chosen-action probabilities could be gathered with
        # tf.gather_nd over stacked (step, action) indices before taking the log)
        log_prob = tf.reduce_sum(tf.math.log(predicts) * tf.one_hot(actions, self.num_action), axis=1)
        # REINFORCE loss for one trajectory: loss = sum_t( -log(Pr(a_t | s_t; theta)) * G_t )
        # The model is updated with one whole trajectory at a time.
        return tf.reduce_sum(-log_prob * gt)

    def get_metrics_loss(self):
        return self.avg_loss.result()

    def reset_metrics_loss(self):
        self.avg_loss.reset_states()

    def select_action(self, state):
        # Assumes an Epsilon-Greedy exploration strategy
        action = self.exploration_strategy.select_action()
        # If the strategy returns -1 (or exploration has been shut down), exploit:
        # choose the action to which the model assigns the highest probability
        if action == -1 or self.is_shutdown_explore:
            # Predict the probability of each action (stochastic policy)
            predict = self.predict([state])
            # Pick the action with the HIGHEST probability
            return tf.argmax(predict, axis=1)[0]
        else:
            # Otherwise the strategy returned a random action index: explore
            return action

    def shutdown_explore(self):
        self.is_shutdown_explore = True

    def update(self):
        with tf.GradientTape() as tape:
            sample_states, sample_actions, sample_rewards, sample_state_primes = self.sample()
            loss = self.loss(sample_states, sample_actions, sample_rewards, sample_state_primes)
        # Update the policy parameters with the gradient of the trajectory loss
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        self.avg_loss.update_state(loss)
        # Update the exploration rate of the Epsilon-Greedy strategy
        self.exploration_strategy.update_epsilon()
        self.iter += 1
        return loss

    def reset_buffer(self):
        # Initialize / reset the buffer that stores the current trajectory
        # (state, action, reward, next state) for every step of an episode
        self.buffer = {'state': [], 'action': [], 'reward': [], 'state_prime': []}

    def add_buffer(self, new_state, new_action, new_reward, new_state_prime):
        self.buffer['state'].append(new_state)
        self.buffer['action'].append(new_action)
        self.buffer['reward'].append(new_reward)
        self.buffer['state_prime'].append(new_state_prime)

    def sample(self):
        # Return the whole stored trajectory
        return self.buffer['state'], self.buffer['action'], self.buffer['reward'], self.buffer['state_prime']
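

# A minimal training-loop sketch showing how the Agent above could be used.
# It assumes the classic Gym API ('CartPole-v1', env.reset() returning only the
# observation and env.step() returning 4 values). The EpsilonGreedy class below
# is a hypothetical stand-in for whatever exploration_strategy object is passed
# in; it only needs the interface the Agent expects: select_action() returning
# -1 to exploit or a random action index to explore, and update_epsilon() to
# decay the exploration rate.
if __name__ == '__main__':
    import random
    import gym

    class EpsilonGreedy:
        def __init__(self, epsilon, num_action, decay=0.99):
            self.epsilon = epsilon
            self.num_action = num_action
            self.decay = decay

        def select_action(self):
            # Explore with probability epsilon, otherwise signal "exploit" with -1
            if random.random() < self.epsilon:
                return random.randrange(self.num_action)
            return -1

        def update_epsilon(self):
            self.epsilon *= self.decay

    env = gym.make('CartPole-v1')
    agent = Agent(state_size=env.observation_space.shape,
                  num_action=env.action_space.n,
                  reward_discount=0.99,
                  learning_rate=1e-3,
                  exploration_strategy=EpsilonGreedy(epsilon=0.1, num_action=env.action_space.n))

    for episode in range(500):
        state = env.reset()
        agent.reset_buffer()
        done = False
        while not done:
            action = int(agent.select_action(state))
            state_prime, reward, done, _ = env.step(action)
            agent.add_buffer(state, action, reward, state_prime)
            state = state_prime
        # REINFORCE is a Monte Carlo method: update once per episode on the whole trajectory
        agent.update()
        if (episode + 1) % 20 == 0:
            print('episode {}, avg loss {:.4f}'.format(episode + 1, float(agent.get_metrics_loss())))
            agent.reset_metrics_loss()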