-
Notifications
You must be signed in to change notification settings - Fork 2
/
train_qlearning.py
63 lines (49 loc) · 1.85 KB
/
train_qlearning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""Train a tabular Q-learning agent on gym's MountainCar-v0.

Discretisation of the continuous observation, Q-table construction,
greedy action selection and persistence are delegated to the project's
auxFunctions helpers.
"""
from auxFunctions import getState, createEmptyQTable, maxAction, save_obj
import gym
import random
import numpy as np

env = gym.make('MountainCar-v0')
# Raise the default step limit so episodes are long enough for the car
# to reach the goal during early (mostly random) exploration.
env._max_episode_steps = 1000

# Create an empty Q-table, indexed by (discretised state, action) pairs.
Q = createEmptyQTable()

# Hyperparameters
alpha = 0.1        # Learning Rate
gamma = 0.9        # Discount Factor
epsilon = 1        # e-Greedy exploration probability (decays over training)
episodes = 50000   # number of episodes

score = 0
# Variable to keep track of the total score obtained
# at each episode to plot it later
total_score = np.zeros(episodes)

for i in range(episodes):
    done = False
    observation = env.reset()
    state = getState(observation)
    if i % 500 == 0:
        # NOTE(review): `score` here is the previous episode's total,
        # since it is reset just below.
        print(f'episode: {i}, score: {score}, epsilon: {epsilon:0.3f}')
    score = 0
    while not done:
        # e-Greedy strategy
        # Explore random action with probability epsilon
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        # Take best action with probability 1-epsilon
        else:
            action = maxAction(Q, state)
        # Observe next state (old gym API: 4-tuple return from step()).
        next_observation, reward, done, info = env.step(action)
        next_state = getState(next_observation)
        # Add reward to the score of the episode
        score += reward
        # Q-learning target: greedy action in the next state.
        next_action = maxAction(Q, next_state)
        # Update Q value for state and action given the Bellman equation
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * Q[next_state, next_action] - Q[state, action])
        # Move to next state
        state = next_state
    # Save score for this episode
    total_score[i] = score
    # Linear epsilon decay, floored at 0.01 to keep some exploration.
    epsilon = epsilon - 2/episodes if epsilon > 0.01 else 0.01

# Save Q-table as .pkl file
save_obj(Q, 'Q-table-Q-Learning')