-
Notifications
You must be signed in to change notification settings - Fork 0
/
against_itself.py
60 lines (42 loc) · 1.53 KB
/
against_itself.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#######################################################################
#
# Retrieve the evaluated state-value pairs and
# let the trained agent play against itself.
# The moves are greedy, thus the game is deterministic.
#
#######################################################################
import tictactoe
import pickle
from utils import *
#### Load the state-value pairs ######
def _load_state_values(path='./memory/state_action.txt'):
    """Load the pickled (states, x_values, o_values) triple from *path*.

    Returns:
        tuple: (states, x_values, o_values) as stored during training.

    NOTE(review): ``pickle.load`` is only safe on trusted, locally generated
    files — never point this at untrusted input.
    """
    with open(path, 'rb') as file:
        state_action = pickle.load(file)
    return state_action[0], state_action[1], state_action[2]


def _best_action(state, states, values):
    """Return the greedy move: the legal cell with the highest expected value.

    A move's expected value is the mean of the stored values over every
    board state reachable by playing it (uniform transition probability,
    as in the original ``p = 1 / len(outcome_indices)`` weighting).
    Values are rounded to 3 decimals before comparison so tie-breaking
    matches the original behavior; ties go to the lowest board index
    (``max`` returns the first maximal element).
    """
    legal_moves = [i for i, value in enumerate(state) if value == 0]

    def expected_value(action):
        outcome_indices = possible_outcome_indices(states, state, action)
        return round(sum(values[i] for i in outcome_indices) / len(outcome_indices), 3)

    return max(legal_moves, key=expected_value)


def main():
    """Play one full, deterministic self-play game with the trained agent."""
    states, x_values, o_values = _load_state_values()
    #### Initialize the game ######
    env = tictactoe.Game(debug=True)
    state = env.reset()
    done = False
    while not done:
        #### Determine the player ######
        # sum(state) == 0 means X (1) and O (-1) have placed equal pieces,
        # so it is X's turn; otherwise O moves.
        if sum(state) == 0:
            label, values = 1, x_values
        else:
            label, values = -1, o_values
        #### Evaluate the best action according to MDP ######
        action = _best_action(state, states, values)
        ######## Take the game step based on the picked action #######
        state, reward, done = env.step(action, label=label)


if __name__ == "__main__":
    main()