#This Python file implements the REINFORCE policy-gradient approach to solve the CartPole-v1 task from the OpenAI Gym environment
#Importing all the necessary frameworks
import numpy as np
import tensorflow as tf
import keras
from keras.layers import Input, Dense
from keras.models import Model
import keras.backend as k
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import random
import gym
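#Note (an assumption about the intended setup): this script uses the classic Gym API
#(gym<0.26), where env.reset() returns just the state and env.step() returns a
#4-tuple (state,reward,done,info), together with standalone Keras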
#A function to keep track of the running average reward
def running_reward_avg(rewards):
    output=[]
    for i in range(len(rewards)):
        output.append(sum(rewards[:i+1])/(i+1))
    return output
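#For example (illustrative): running_reward_avg([10,20,30]) returns [10.0,15.0,20.0],
#since each entry is the mean of all episode rewards up to and including that index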
#Initializing the Agent class which controls the agent's behaviour/improvement in the environment
class Agent(object):
    def __init__(self,env,gamma=0.98,alpha=0.01):
        self.env=env #The OpenAI Gym environment
        self.state_size=self.env.observation_space.shape[0] #Number of features describing each state in the environment
        self.action_size=self.env.action_space.n #Number of possible actions in the environment
        self.gamma=gamma #Discount factor for future rewards
        self.alpha=alpha #Learning rate during training
        self.n_hl1=16 #Number of units in the first hidden layer of the network
        self.n_hl2=16 #Number of units in the second hidden layer of the network
        self.network1,self.network2=self.build_network() #Building the networks that take states as input and output stochastic action probabilities
        self.reward_history=[] #Reward history to keep track of rewards per episode
        self.episode_lengths=[] #To keep track of the length of each episode
    #A function to initialise/construct the neural network that calculates stochastic action probabilities
    def build_network(self):
        inputs=Input(shape=[self.state_size])
        reward=Input(shape=[1]) #Discounted return, fed in so the loss can weight each timestep
        X=Dense(self.n_hl1, activation='relu')(inputs)
        X=Dense(self.n_hl2, activation='relu')(X)
        outputs=Dense(self.action_size, activation='softmax')(X)
        model1=Model(inputs=[inputs,reward],outputs=outputs)
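        #REINFORCE loss: for each timestep the policy-gradient objective is
        #-G_t*log(pi(a_t|s_t)), where G_t is the discounted return fed in through
        #the auxiliary reward input and the one-hot y_true selects the
        #log-probability of the action that was actually taken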
        def custom_loss(y_true,y_pred):
            #Keras passes the arguments in the order (y_true,y_pred); clipping avoids log(0)
            probs=k.clip(y_pred,1e-10,1-1e-10)
            return -k.sum(k.log(probs)*y_true,axis=-1)*k.flatten(reward)
        model1.compile(optimizer=Adam(learning_rate=self.alpha),loss=custom_loss)
        model2=Model(inputs=inputs,outputs=outputs)
        return model1,model2
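    #Design note: model1 and model2 share the same underlying layers, so fitting
    #model1 (whose extra return input is used only inside the loss) also updates
    #the weights that model2 uses when selecting actions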
    #A function to calculate the discounted return for a particular timestep in an episode
    def discounted_returns(self,rewards):
        discounted_returns=np.zeros((len(rewards)))
        current_return=0
        for t in reversed(range(len(rewards))):
            current_return=rewards[t]+self.gamma*current_return
            discounted_returns[t]=current_return
        return discounted_returns
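    #For example (illustrative): with rewards [1,1,1] and gamma=0.98 this yields
    #[2.9404,1.98,1.0], since G_t=r_t+gamma*G_{t+1}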
    #Choosing a greedy action most of the time (and sometimes a random action, to promote exploration)
    def choose_action(self,state,epsilon=0.2):
        policy_output=self.network2.predict(state.reshape([1,self.state_size]))
        if np.random.rand(1)>epsilon:
            return np.argmax(policy_output[0])
        else:
            return np.random.randint(self.action_size)
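    #Note: textbook REINFORCE samples actions from the softmax policy itself rather
    #than acting epsilon-greedily; a common alternative (an assumption, not part of
    #the original script) would be:
    #    return np.random.choice(self.action_size,p=policy_output[0])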
    #Updating/fitting the agent's network (at the end of every episode)
    def update_network(self,episode):
        states=[]
        actions=[]
        rewards=[]
        for state,action,reward in episode:
            states.append(state)
            actions.append(action)
            rewards.append(reward)
        states=np.array(states)
        actions=np.eye(self.action_size)[np.array(actions)] #One-hot encoding of the actions taken
        returns=self.discounted_returns(rewards)
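        #Note: a common variance-reduction tweak (an assumption, not part of the
        #original script) is to standardise the returns before fitting, e.g.
        #    returns=(returns-returns.mean())/(returns.std()+1e-8)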
        self.network1.fit([states,returns.reshape(-1,1)],actions,verbose=0) #Reshaping the returns to match the (None,1) reward input
    #A function to generate an episode
    def generate_episode(self):
        #A buffer to store information about each timestep of the episode
        episode=[]
        #Maintaining the reward buffer, and initialising the step count
        reward_buffer=0
        j=0
        #Resetting the starting state of the episode
        state_now=self.env.reset()
        while j<1000:
            #Choosing an action, claiming the reward, and proceeding to the next state
            action=self.choose_action(state_now)
            state_next,reward,done,_=self.env.step(action)
            episode.append([state_now,action,reward])
            #Updating the reward buffer
            reward_buffer+=reward
            j+=1
            #If the episode is done, return the episode buffer, else go to the next timestep
            if done:
                self.reward_history.append(reward_buffer)
                self.episode_lengths.append(j)
                return episode
            else:
                state_now=state_next
        #Safety net: if the step cap is hit without termination, still record and return the episode
        self.reward_history.append(reward_buffer)
        self.episode_lengths.append(j)
        return episode
    #Training the agent over a number of episodes
    def train(self,num_episodes=5000):
        #Iterating over episodes
        for i in range(num_episodes):
            #Generating an episode
            episode=self.generate_episode()
            #Updating the network using the REINFORCE algorithm, after discounting the rewards
            self.update_network(episode)
            #Keeping track of the progress of successive episodes (length of episode, and total reward)
            if (i+1)%100==0:
                print("Length of episode {} : {}".format(i+1,self.episode_lengths[i]))
                print("Total reward claimed by the agent in episode {} : {}".format(i+1,self.reward_history[i]))
#Creating an environment, and an agent
env=gym.make("CartPole-v1")
agent=Agent(env)
#Training the agent using the REINFORCE policy-gradient algorithm with the above-mentioned parameters
agent.train()
#Plotting the results
fig, axs = plt.subplots(1,2)
axs[0].plot(running_reward_avg(agent.reward_history))
axs[0].set_title('Average Reward per Episode')
axs[1].plot(agent.episode_lengths, 'tab:orange')
axs[1].set_title('Episode Length')
plt.show()