-
Notifications
You must be signed in to change notification settings - Fork 0
/
qlearning3.py
135 lines (115 loc) · 4.46 KB
/
qlearning3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from __future__ import division
import random
import matplotlib.pyplot as plt
# Grid-world layout: 3 rows x 4 columns = 12 states, numbered row-major (0..11).
grid = [[0] * 4 for _ in range(3)]

# actions maps each state key "rowcol" to its moves [up, down, left, right];
# -1 marks a direction blocked by the grid edge.
actions = {}
converter = {'u': 0, 'd': 1, 'l': 2, 'r': 3}
probability = 0.70  # chance the planned action is actually executed

for r in range(3):
    for c in range(4):
        moves = [
            'u' if r > 0 else -1,  # Up
            'd' if r < 2 else -1,  # Down
            'l' if c > 0 else -1,  # Left
            'r' if c < 3 else -1,  # Right
        ]
        actions[str(r) + str(c)] = moves  # link actions with every state
# Row 1, col 1 (state index 5) is the terminal/goal state: no moves out of it.
actions['11'] = [-1, -1, -1, -1]

# Immediate rewards: 100 for each (state, action) pair that enters state 5.
rewards = [[0, 0, 0, 0] for _ in range(12)]
rewards[4][3] = 100  # state 4, move right -> state 5
rewards[6][2] = 100  # state 6, move left  -> state 5
rewards[1][1] = 100  # state 1, move down  -> state 5
rewards[9][0] = 100  # state 9, move up    -> state 5

# Per-(state, action) visit counts, used for the decaying learning rate.
visit = [[0, 0, 0, 0] for _ in range(12)]
# Accumulated observed rewards per (state, action) -> empirical expected reward.
nrewards = [[0, 0, 0, 0] for _ in range(12)]

gamma = 0.9          # discount factor
iterations = 20000   # number of training episodes
# Q-table; -1 marks entries that have never been updated.
qtable = [[-1] * 4 for _ in range(12)]
dqsum = []           # |change in total Q| per episode, for the convergence plot
def qlearning():
    """Train the Q-table by stochastic Q-learning on the 3x4 grid world.

    Runs `iterations` episodes, each starting from a uniformly random state
    and ending when the terminal state (index 5) is reached.  The executed
    action equals the planned one with chance `probability`, otherwise a
    different legal action is taken.  After training, plots |delta Qsum| per
    episode and prints the Q-table and the empirical expected rewards.

    NOTE: written for Python 2 (print statements).
    """
    prevqsum = 0
    currqsum = 0
    for i in range(iterations):
        # Start each episode in a uniformly random state (may be the goal itself).
        state=random.randint(0,11)
        while state!=5: #(For GoalState6(i-1 standard in array)))
            # print state
            row=int(state/4)
            col=int(state%4)
            next_actions=actions[str(row) +str(col)]
            # Sample legal planned actions by rejection: retry while blocked (-1).
            plannedAction=next_actions[random.randint(0,3)]
            while plannedAction==-1:
                plannedAction = next_actions[random.randint(0, 3)]
            fstate = future_state(state, plannedAction)
            visit[state][converter[plannedAction]]+=1
            # Learning rate decays with visit count: alpha = 1/(1 + n(s,a)).
            # (True division is in effect via `from __future__ import division`.)
            alpha = 1 / (1 + visit[state][converter[plannedAction]])
            randomValue=random.uniform(0,1)
            if randomValue<probability: #For Desired Action
                # Standard Q-learning update:
                # Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma*max_a' Q(s',a')).
                reward = rewards[state][converter[plannedAction]]
                nrewards[state][converter[plannedAction]] += rewards[state][converter[plannedAction]]
                old_qsa = qtable[state][converter[plannedAction]]
                new_qsa = (1 - alpha) * old_qsa + alpha * (reward + gamma * findmaxq(state,plannedAction))
                qtable[state][converter[plannedAction]] = new_qsa
            else: #For Undesired Action
                # Environment noise: a different legal action is executed instead.
                randomAction=next_actions[random.randint(0, 3)]
                while randomAction==plannedAction or randomAction==-1:
                    randomAction = next_actions[random.randint(0, 3)]
                # The reward observed is the executed (random) action's reward,
                # but it is credited to the planned action's Q entry / nrewards
                # slot — NOTE(review): looks intentional (models a noisy
                # environment from the planned action's viewpoint), but the
                # mixed indices are worth confirming against the assignment spec.
                reward = rewards[state][converter[randomAction]]
                nrewards[state][converter[plannedAction]] += rewards[state][converter[randomAction]]
                old_qsa = qtable[state][converter[plannedAction]]
                new_qsa = (1 - alpha) * old_qsa + alpha * (reward + gamma * findmaxq(state, randomAction))
                qtable[state][converter[plannedAction]] = new_qsa
            # NOTE(review): the agent always transitions via the PLANNED
            # action's successor (fstate), even when the random action was the
            # one executed and used in the update above — verify this is the
            # intended transition model.
            state=fstate
        # Track convergence: sum all initialized Q entries and record the
        # absolute change versus the previous episode.
        # (The inner `i` shadows the episode counter but does not disturb the
        # outer for-loop's iterator.)
        prevqsum=currqsum
        currqsum=0
        for i in range(12):
            for j in range(4):
                if qtable[i][j]!=-1:
                    currqsum+=qtable[i][j]
        dqsum.append(abs(currqsum-prevqsum))
    # Plot the per-episode change in total Q to visualize convergence.
    plt.plot(dqsum)
    plt.ylabel("Delta Qsum")
    plt.xlabel("Iterations")
    plt.show()
    # Print the learned Q-table; states are shown 1-based, -1 = never updated.
    print "QTable"
    print '%15s %15s %15s %15s %15s' % ("State", "Up", "Down", "Left", "Right")
    for i in range(12):
        print '%15s' % str(i + 1),
        for j in range(4):
            if qtable[i][j] != -1:
                print '%15.7f' % qtable[i][j],
            else:
                print '%15s' % str(-1),
        print ''
    # Empirical expected reward per action = accumulated reward / visit count,
    # shown only for the states adjacent to the goal (0-based 1, 4, 6, 9).
    print "Expected Rewards"
    print '%20s %20s %20s %20s %20s' % ("State", "Up", "Down", "Left", "Right")
    showstates=[1,4,6,9]
    for i in range(12):
        if i in showstates:
            print '%20s %20s %20s %20s %20s' % (
                str(i + 1), str((nrewards[i][0]/visit[i][0]) if nrewards[i][0]!=0 else -1), str((nrewards[i][1]/visit[i][1]) if nrewards[i][1]!=0 else -1),
                str((nrewards[i][2]/visit[i][2]) if nrewards[i][2]!=0 else -1), str((nrewards[i][3]/visit[i][3])) if nrewards[i][3]!=0 else -1)
def findmaxq(state,action):
    """Return max_a' Q(s', a') for the state s' reached by taking `action`
    from `state`, considering only legal actions and floored at 0 (so
    never-updated entries, stored as -1, cannot drag the maximum negative).
    """
    fstate = future_state(state, action)
    key = str(int(fstate / 4)) + str(int(fstate % 4))
    # Collect Q-values of every legal move out of the successor state;
    # the leading 0 reproduces the original floor on the maximum.
    candidates = [0] + [
        qtable[fstate][converter[move]]
        for move in actions[key]
        if move != -1
    ]
    return max(candidates)
def future_state(state,action):
    """Return the state index reached by taking `action` from `state`.

    The grid is 3x4 and row-major, so 'u'/'d' shift by a full row (4) and
    'l'/'r' by one column.  Any unrecognized action yields 0, matching the
    original default.  No bounds checking is performed here; callers are
    expected to pass only legal actions.
    """
    offsets = {'u': -4, 'd': 4, 'l': -1, 'r': 1}
    return state + offsets[action] if action in offsets else 0
qlearning()