-
Notifications
You must be signed in to change notification settings - Fork 11
/
DeterministicFrozenNegReward.py
93 lines (82 loc) · 3.01 KB
/
DeterministicFrozenNegReward.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import deeprl_hw1.lake_envs as lake_env
import gym
import time
import seaborn
from tabulate import tabulate
import matplotlib.pyplot as plt
from deeprl_hw1.rlvaliterchngd import *
def run_policy(env, gamma, policy):
    """Execute a state-indexed policy in the environment until termination.

    Parameters
    ----------
    env : object with the gym API: reset() -> initial state, and
        step(action) -> (next_state, reward, is_terminal, debug_info).
    gamma : float
        Discount factor applied per step.
    policy : indexable (list/array/dict) mapping state -> action.

    Returns
    -------
    (total_reward, num_steps) : discounted cumulative reward
        (sum of gamma**t * r_t) and the number of env.step() calls made.
    """
    current_state = env.reset()
    # NOTE(review): the original slept 1s at reset and after every step so a
    # render could be watched, but the env.render() calls were commented out;
    # the dead delays are removed here.  gamma**t replaces math.pow, which
    # relied on `math` being provided by a star import.
    total_reward = 0
    num_steps = 0
    while True:
        nextstate, reward, is_terminal, debug_info = env.step(policy[current_state])
        total_reward += (gamma ** num_steps) * reward
        num_steps += 1
        if is_terminal:
            break
        current_state = nextstate
    return total_reward, num_steps
# Script: run value iteration on the deterministic 4x4 negative-reward
# FrozenLake env, print the greedy policy as a LaTeX table, and save a
# heatmap + CSV of the value function.
# NOTE(review): print statements converted to the single-argument
# parenthesized form, which prints identically under Python 2 and is also
# valid Python 3.
grid = 4
envname = 'Deterministic-4x4-neg-reward-FrozenLake-v0'
env = gym.make(envname)
env.render()
gamma = 0.16
# --- policy-iteration experiment kept for reference (disabled) ---
# print "Executing Policy Iteration"
# start_time=time.time()
# policy, value_func, policy_iters, val_iters= policy_iteration(env,gamma)
# print "Total time taken: "+str((time.time()-start_time))
# print "Total Policy Improvement Steps: "+str(policy_iters)
# print "Total Policy Evaluation Steps: "+str(val_iters)
# print "Policy:"
# policy_str=print_policy(policy,lake_env.action_names)
# ps=[]
# for elem in policy_str:
#     ps.append(elem[0])
# reshaped_policy=np.reshape(ps,(grid,grid))
# print tabulate(reshaped_policy,tablefmt='latex')
# f, ax = plt.subplots(figsize=(11, 9))
# cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
# reshaped=np.reshape(value_func,(grid,grid))
# seaborn.heatmap(reshaped, cmap=cmap, vmax=1.1,
#                 square=True, xticklabels=grid+1, yticklabels=grid+1,
#                 linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
# plt.savefig('1c.png',bbox_inches='tight')
# np.savetxt('1gpolicy.csv',reshaped,delimiter=',')
print("Executing Value Iteration")
start_time = time.time()
value_function, value_iters = value_iteration(env, gamma)
print("Total time taken: " + str((time.time() - start_time)))
print("Total Value Iteration Steps: " + str(value_iters))
print("Policy:")
# Derive the greedy policy from the converged value function and render it
# as a grid of action initials (one letter per cell).
policy = value_function_to_policy(env, gamma, value_function)
policy_str = print_policy(policy, lake_env.action_names)
ps = [elem[0] for elem in policy_str]
reshaped_policy = np.reshape(ps, (grid, grid))
print(tabulate(reshaped_policy, tablefmt='latex'))
# Heatmap of the value function, saved as <envname>.png, plus raw CSV dump.
f, ax = plt.subplots(figsize=(11, 9))
cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
reshaped = np.reshape(value_function, (grid, grid))
seaborn.heatmap(reshaped, cmap=cmap, vmax=5,
                square=True, xticklabels=grid + 1, yticklabels=grid + 1,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.savefig(envname + '.png', bbox_inches='tight')
np.savetxt(envname + '_2cvalue.csv', reshaped, delimiter=',')
# --- Monte-Carlo policy evaluation kept for reference (disabled) ---
# total_cum_reward=0
# maxn=5
# start_time=time.time()
# for n in range(maxn):
#     cum_reward,nsteps=run_policy(env,gamma,policy)
#     total_cum_reward+=cum_reward
#     if n%1==0: print "Done "+str(n)
#     print ("Time: "+str((time.time()-start_time)/60))
#
# print "Average Cumulative Reward: "+str((total_cum_reward/maxn))
# print "No. of steps: "+str(nsteps)