# This script makes use of OpenAI Gym to train an agent on the CartPole game.
# Description of the game:
# A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
# The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright,
# and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that
# the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or
# the cart moves more than 2.4 units from the center.
# Source: https://gym.openai.com/envs/CartPole-v1/
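# For CartPole-v1 the observation is a 4-dimensional vector:
# [cart position, cart velocity, pole angle, pole angular velocity].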
# Parameters:
# episodes - the number of games we want the agent to play.
# gamma - aka decay or discount rate, used to calculate the future discounted reward.
# epsilon - aka exploration rate, the rate at which the agent chooses its action at random rather than by prediction.
# epsilon_decay - we want to decrease the amount of exploration as the agent gets better at playing games.
# epsilon_min - we want the agent to explore at least this much.
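# As a rough worked example with the values used below (epsilon = 1.0,
# epsilon_decay = 0.995, epsilon_min = 0.01): epsilon after n replay calls is
# about max(0.01, 0.995 ** n), so the agent reaches the minimum exploration
# rate after roughly 920 replay calls (0.995 ** 920 ~= 0.01).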
#!pip install gym
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
### Inspired by the post at keon.io/deep-q-learning/
episodes = 1000
'''
By defining a memory, we make sure that the state, action, reward and next_state
are remembered, as the neural network in DQN tends to forget them after each
iteration.
'''
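# Each stored experience is a (state, action, reward, next_state, done) tuple,
# so minibatches sampled from this memory can be replayed for training later.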
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
    def _build_model(self):
        '''
        Neural network for DQN: maps a state to one Q-value per action.
        '''
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    def remember(self, state, action, reward, next_state, done):
        '''
        Keep appending experiences to the memory.
        '''
        self.memory.append((state, action, reward, next_state, done))
    def act(self, state):
        '''
        The agent at first selects its action at random, because
        it is better for the agent to try all kinds of things before
        it starts to see the patterns.
        '''
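        # Epsilon-greedy: with probability epsilon take a random action,
        # otherwise act greedily on the model's predicted Q-values.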
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        # argmax returns the index of the highest predicted Q-value,
        # e.g. act_values[0] = [0.67, 0.04] -> action 0
        return np.argmax(act_values[0])
    def replay(self, batch_size):
        '''
        Trains the neural net with experience from the memory.
        We need to maximise the rewards in the long run, so we use gamma (the
        discount rate), through which the agent learns to maximise the discounted
        future reward in the long run.
        '''
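        # For each sampled experience, the training target for the chosen action is
        # the Q-learning (Bellman) target: reward + gamma * max_a' Q(next_state, a'),
        # or just the reward itself when the episode has ended.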
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 32

    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
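            # A negative reward on termination (the pole fell over or the cart left
            # the track) penalizes failure; otherwise the usual +1 per step is kept.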
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("Episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, episodes, time, agent.epsilon))
                break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)