agents.py
from collections import defaultdict
import numpy as np
from game import is_terminal
import csv
import pandas as pd

class QLearningAgent:
    # largely based on https://github.com/kamenbliznashki/sutton_barto/blob/master/agents.py
    def __init__(self, gamma, alpha, eps, game):
        # Q-learning agent
        self.gamma = gamma  # discount factor
        self.alpha = alpha  # learning rate / Bellman error weight
        self.eps = eps      # probability of taking the greedy action (random otherwise)
        # keeps track of Q values for every (state, action) pair encountered
        self.q_values = defaultdict(float)
        self.game = game
        self.num_updates_done = 0

    def reset(self):
        self.q_values = defaultdict(float)
        self.num_updates_done = 0

    def q_value(self, state, action):
        return self.q_values[(tuple(state), action)]

    def get_best_action(self):
        s = self.game.get_state()
        action_set = self.game.get_action_set(s)
        q_vals = [self.q_value(s, action) for action in action_set]
        best_q = np.max(q_vals)
        # collect every action whose estimated Q value ties for the maximum
        best_actions = [a for i, a in enumerate(action_set) if q_vals[i] == best_q]
        idx = np.random.choice(len(best_actions))
        return best_actions[idx]

    def select_action(self):
        action_set = self.game.get_action_set(self.game.get_state())
        if not action_set:
            raise Exception('No action available to take -- game is over')
        if np.random.uniform() < self.eps:
            # with probability eps, act greedily
            return self.get_best_action()
        # otherwise return a uniformly random action from action_set
        idx = np.random.choice(len(action_set))
        return action_set[idx]

    def update(self, state, action, reward, next_state):
        # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)); returns the new Q value
        curr_q = self.q_value(state, action)
        action_vals = [self.q_value(next_state, a)
                       for a in self.game.get_action_set(next_state)]
        next_q = np.max(action_vals) if len(action_vals) > 0 else 0
        self.q_values[(tuple(state), action)] += self.alpha * (reward + self.gamma * next_q - curr_q)
        return self.q_values[(tuple(state), action)]
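
# Illustrative only: a minimal sketch of how QLearningAgent might be driven
# through one episode. No training loop appears in this file, and the
# reset()/apply_action()/get_reward() methods used below are hypothetical --
# the game object is only known (from the code above) to expose get_state()
# and get_action_set(); substitute whatever stepping/reward API the actual
# game class provides.
def run_q_learning_episode(agent, game):
    game.reset()                    # assumed: restores the game to its start state
    total_reward = 0.0
    while not is_terminal(game.get_state())[0]:
        state = game.get_state()
        action = agent.select_action()
        game.apply_action(action)   # assumed: applies the chosen action to the game
        next_state = game.get_state()
        reward = game.get_reward()  # assumed: reward observed after the transition
        agent.update(state, action, reward, next_state)
        total_reward += reward
    return total_reward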

class ValueAgent:
    def __init__(self, alpha, player_num, eps, key_lst=None, value_lst=None):
        self.alpha = alpha            # step size for the value updates
        self.player_num = player_num
        self.eps = eps                # probability of taking the greedy action
        # key_lst holds the game states; state_vals assigns each state an
        # estimated probability of winning from that state
        self.key_lst = key_lst
        self.state_vals = {}
        if value_lst is None:
            self.reset()
        else:
            assert len(self.key_lst) == len(value_lst)
            for j in range(len(key_lst)):
                v = value_lst[j]
                s = key_lst[j]
                self.state_vals[s] = v

    # Initialization has to ensure the probability of winning is 1 from winning
    # states, 0 from losing states, and 0.5 from every other state. This amounts
    # to walking the list of all possible game states and building a dict that
    # assigns those probabilities accordingly.
    def reset(self):
        self.state_vals = {}
        for key in self.key_lst:
            terminal, winner = is_terminal(key)
            if terminal:
                if winner == self.player_num:
                    self.state_vals[key] = 1.0
                else:
                    self.state_vals[key] = 0.0
            else:
                self.state_vals[key] = 1 / 2

    def get_value(self, state):
        return self.state_vals[tuple(state)]

    def get_best_action(self, game):
        s = game.get_state()
        action_set = game.get_action_set(s)
        next_state_vals = [self.state_vals[tuple(game.next_state(s, action))]
                           for action in action_set]
        best_state_val = np.max(next_state_vals)
        best_actions = [a for i, a in enumerate(action_set)
                        if next_state_vals[i] == best_state_val]
        action_idx = np.random.choice(len(best_actions))
        return best_actions[action_idx]

    def select_action(self, game):
        s = game.get_state()
        action_set = game.get_action_set(s)
        if np.random.uniform() < self.eps:
            # with probability eps, take the greedy action
            return self.get_best_action(game)
        return action_set[np.random.choice(len(action_set))]

    def update(self, state, next_state):
        # V(s) <- V(s) + alpha * (V(s') - V(s))
        curr_val = self.state_vals[tuple(state)]
        next_val = self.state_vals[tuple(next_state)]
        new_val = curr_val + self.alpha * (next_val - curr_val)
        self.state_vals[tuple(state)] = new_val

    def save_vals(self, filename):
        # dump the learned state values to <filename>.csv
        df = pd.DataFrame({'States': list(self.state_vals.keys()),
                           'Values': list(self.state_vals.values())})
        df.to_csv(f'{filename}.csv', index=False)
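
# Illustrative only: a possible counterpart to ValueAgent.save_vals that reads
# the CSV back into (key_lst, value_lst) suitable for the ValueAgent
# constructor. It assumes the states are tuples, which pandas round-trips
# through the CSV as their string repr, hence the ast.literal_eval call;
# adjust the parsing if the real states are stored differently.
def load_vals(filename):
    import ast
    df = pd.read_csv(f'{filename}.csv')
    key_lst = [ast.literal_eval(s) for s in df['States']]
    value_lst = list(df['Values'])
    return key_lst, value_lst

# Usage sketch (hypothetical filename and constructor arguments; note that in
# this code eps is the probability of acting greedily):
#   keys, vals = load_vals('value_agent_p1')
#   agent = ValueAgent(alpha=0.1, player_num=1, eps=0.9, key_lst=keys, value_lst=vals)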