# lunarlandingdraftcg_220722.py
# Forked from carlacodes/atchekegroup1lunarlanding.
import io
import os
import glob
import torch
import pip
import base64
# import stable_baselines3
import numpy as np
import matplotlib.pyplot as plt
import stable_baselines3
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.results_plotter import ts2xy, load_results
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_atari_env
import gym
from gym import spaces
from YoutubeCodeRepository.ReinforcementLearning.DeepQLearning import simple_dqn_torch_2020
from gym.wrappers import Monitor, RecordVideo
#
# # @title Plotting/Video functions
# from IPython.display import HTML
# from pyvirtualdisplay import Display
# from IPython import display as ipythondisplay
# Report the CUDA setup and pick the device to run on.
# Asking for the current CUDA device (or its name) raises on a machine
# without a usable GPU, so those queries are only made once CUDA has been
# confirmed available; otherwise the script falls back to the CPU.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system?{cuda_available}")
print(f"CUDA version: {torch.version.cuda}")
if cuda_available:
    # Storing ID of current CUDA device
    cuda_id = torch.cuda.current_device()
    print(f"ID of current CUDA device:{cuda_id}")
    print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
else:
    cuda_id = None  # no CUDA device to identify
cuda = torch.device('cuda')  # Default CUDA device (handle only; nothing is allocated here)
device = torch.device("cuda:0" if cuda_available else "cpu")
print(device)
# with torch.cuda.device(0):
# --- Network and optimiser hyper-parameters ---
# Hidden-layer sizes of the network: currently two layers of 64 neurons each.
# For three layers of 64 neurons use [64, 64, 64], and so on.
nn_layers = [64, 64]
# Step size with which gradient descent is carried out.
# Tip: use smaller step sizes for larger networks.
learning_rate = 0.001

# --- Environment ---
# Other environments (CartPole, MountainCar, Acrobot, ...) can be loaded the
# same way, e.g. by using 'CartPole-v1' as the id with no extra options.
# Refer to https://gym.openai.com/docs/ for descriptions.
env_id = 'LunarLander-v4'
env_kwargs = dict(enable_wind=True, wind_power=15.0)
env = gym.make(env_id, **env_kwargs)

# Directory that receives the Monitor log and the EvalCallback results.
# (An earlier "/tmp/gym2007/" assignment was overwritten before ever being
# used, so only the effective value is kept.)
log_dir = 'C:/Users/carla/PycharmProjects/atchekegroup1lunarlanding/gym/'
os.makedirs(log_dir, exist_ok=True)

# Wrap the training environment so its episodes are logged under log_dir;
# these logs are what gets plotted after training.
env = stable_baselines3.common.monitor.Monitor(env, log_dir)

# Periodic evaluation of the agent, with results written to log_dir.
# Evaluation runs on its own environment instance: sharing the training
# environment would reset it in the middle of a training episode and mix the
# evaluation episodes into the training Monitor log.
eval_env = stable_baselines3.common.monitor.Monitor(gym.make(env_id, **env_kwargs))
callback = EvalCallback(eval_env, log_path=log_dir, deterministic=True)

# Architecture handed to the policy: ReLU activations, sizes from nn_layers.
policy_kwargs = dict(activation_fn=torch.nn.ReLU, net_arch=nn_layers)
# Baseline agent: a DQN stripped down to single-sample updates, i.e. no real
# replay buffer, one gradient step per environment step, and a target
# network that is refreshed at every step.
model_old = DQN(
    "MlpPolicy",
    env,
    # -- network / optimiser --
    policy_kwargs=policy_kwargs,
    learning_rate=learning_rate,
    max_grad_norm=10,  # maximum value for the gradient clipping
    # -- replay buffer and update schedule --
    buffer_size=1,  # replay buffer size; 1 because no batch update is done
    batch_size=1,  # for simplicity, no batch update
    learning_starts=1,  # learning starts immediately
    train_freq=(1, "step"),  # train the network at every step
    gradient_steps=1,  # number of gradient steps per update
    # -- target network --
    gamma=0.99,  # discount factor, between 0 and 1
    tau=1,  # soft-update coefficient for the target network
    target_update_interval=1,  # update the target network immediately
    # -- epsilon-greedy exploration (each value lies between 0 and 1) --
    exploration_initial_eps=0.9,  # initial probability of a random action
    exploration_fraction=0.8,  # fraction of training over which it is reduced
    exploration_final_eps=0.05,  # final probability of a random action
    # -- misc --
    seed=1,  # seed for the pseudo-random generators
    device="cuda",
    verbose=0,  # set to 1 to observe training logs (encouraged)
)
# Tuned agent: unlike the single-sample baseline, this configuration uses a
# large replay buffer with mini-batch updates and a slowly refreshed target
# network. Parameter meanings follow the stable-baselines3 DQN documentation.
model_test = DQN(
    "MlpPolicy",
    env,
    # -- network / optimiser --
    policy_kwargs=policy_kwargs,
    learning_rate=6.3e-4,
    # max_grad_norm is intentionally not passed here (library default applies)
    # -- replay buffer and update schedule --
    buffer_size=50000,  # number of transitions kept in the replay buffer
    batch_size=128,  # mini-batch size for each gradient update
    learning_starts=0,  # no warm-up period before learning begins
    train_freq=(4, "step"),  # run an update every 4 environment steps
    gradient_steps=-1,  # -1: as many gradient steps as steps collected since the last update
    # -- target network --
    gamma=0.99,  # discount factor, between 0 and 1
    tau=1,  # update coefficient for the target network (1 = full copy)
    target_update_interval=250,  # refresh the target network every 250 steps
    # -- epsilon-greedy exploration (each value lies between 0 and 1) --
    exploration_initial_eps=0.9,  # initial probability of a random action
    exploration_fraction=0.8,  # fraction of training over which it is reduced
    exploration_final_eps=0.1,  # final probability of a random action
    # -- misc --
    seed=1,  # seed for the pseudo-random generators
    device="cuda",
    verbose=1,  # print training logs
)
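# You can also experiment with other RL algorithms such as A2C, PPO or DDPG. Refer to
# https://stable-baselines3.readthedocs.io/en/master/guide/examples.html for documentation.
# To switch, import the other algorithm's class, use it in place of "DQN" above and adjust the
# keyword arguments (each algorithm accepts its own set; the exploration_* options are DQN-specific).
# Note that DDPG only supports continuous action spaces.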
#
# for _ in range(1000):
# env.render()
# env.step(env.action_space.sample())
#
# env.close()
"""
Utility functions to enable video recording of gym environment
and displaying it.
To enable video, just do "env = wrap_env(env)".
"""
#
# test_env = (gym.make("LunarLander-v4"))
# observation = test_env.reset()
# total_reward = 0
# while True:
# test_env.render()
# # for _ in range(1000):
# # env.render()
# test_env.step(env.action_space.sample())
# #
# # env.close()
# action, states = model_old.predict(observation, deterministic=True)
# observation, reward, done, info = test_env.step(action)
# total_reward += reward
# if done:
# break
#
# # print(total_reward)
# test_env.close()
#model_old.learn(total_timesteps=100000, log_interval=10, callback=callback)
# Train the tuned agent, passing the evaluation callback along.
model_test.learn(total_timesteps=100000, log_interval=10, callback=callback)
loadedparams = model_test.get_parameters()

# Turn the Monitor logs found in log_dir into (timesteps, episode reward)
# series and draw the learning curve.
training_log = load_results(log_dir)
timesteps, episode_rewards = ts2xy(training_log, 'timesteps')

ax = plt.gca()
ax.plot(timesteps, episode_rewards)
ax.set_ylim([-300, 300])
ax.set_xlabel('Timesteps')
ax.set_ylabel('Episode Rewards')
ax.set_title('Carl parameters model, trained on regular environment')
plt.show()
# Roll out one rendered episode with the trained agent.
# NOTE(review): this environment is created without the enable_wind /
# wind_power options used for the training environment -- confirm that
# evaluating on the default settings is intended.
env = gym.make("LunarLander-v4")
total_reward = 0  # sum of the rewards collected over the episode
try:
    observation = env.reset()
    env.render()
    done = False
    while not done:
        env.render()
        # Greedy (deterministic) action from the trained policy.
        action, _states = model_test.predict(observation, deterministic=True)
        observation, reward, done, info = env.step(action)
        total_reward += reward
finally:
    # Always release the environment, even if the rollout raised.
    env.close()
# def wrap_env(env):
# env = Monitor(env, './video', force=True)
# env = RecordVideo(env, './video')
# return env
# new_env = wrap_env(gym.make("LunarLander-v4"))
# observation = new_env.reset()
# total_reward = 0
#
# while True:
# new_env.render()
# action, states = model_test.predict(observation, deterministic=True)
# observation, reward, done, info = new_env.step(action)
# total_reward += reward
# if done:
# break;
#
# # print(total_reward)
# new_env.close()