-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtic_env.py
341 lines (291 loc) · 11.5 KB
/
tic_env.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# import gym
import numpy as np
import random
class TictactoeEnv:
'''
Description:
Classical Tic-tac-toe game for two players who take turns marking the spaces in a three-by-three grid with X or O.
The player who succeeds in placing three of their marks in a horizontal, vertical, or diagonal row is the winner.
The game is played by two players: player 'X' and player 'O'. Player 'x' moves first.
The grid is represented by a 3x3 numpy array, with value in {0, 1, -1}, with corresponding values:
0 - place unmarked
1 - place marked with X
-1 - place marked with O
The game environment will recieve movement from two players in turn and update the grid.
self.step:
recieve the movement of the player, update the grid
The action space is [0-8], representing the 9 positions on the grid.
The reward is 1 if you win the game, -1 if you lose, and 0 besides.
'''
def __init__(self):
self.grid = np.zeros((3,3))
self.end = False
self.winner = None
self.player2value = {'X': 1, 'O': -1}
self.num_step = 0
self.current_player = 'X' # By default, player 'X' goes first
def check_valid(self, position):
''' Check whether the current action is valid or not
'''
if self.end:
raise ValueError('This game has ended, please reset it!')
if type(position) is int:
position = (int(position / 3), position % 3)
elif type(position) is not tuple:
position = tuple(position)
return False if self.grid[position] != 0 else True
def step(self, position, print_grid=False):
''' Receive the movement from two players in turn and update the grid
'''
# check the position and value are valid or not
# position should be a tuple like (0, 1) or int [0-8]
if self.end:
raise ValueError('This game has ended, please reset it!')
if type(position) is int:
position = (int(position / 3), position % 3)
elif type(position) is not tuple:
position = tuple(position)
if self.grid[position] != 0:
raise ValueError('There is already a chess on position {}.'.format(position))
# place a chess on the position
self.grid[position] = self.player2value[self.current_player]
# update
self.num_step += 1
self.current_player = 'X' if self.num_step % 2 == 0 else 'O'
# check whether the game ends or not
self.checkEnd()
if print_grid:
self.render()
return self.grid.copy(), self.end, self.winner
def get_current_player(self):
return self.current_player
def checkEnd(self):
# check rows and cols
if np.any(np.sum(self.grid, axis=0) == 3) or np.any(np.sum(self.grid, axis=1) == 3):
self.end = True
self.winner = 'X'
elif np.any(np.sum(self.grid, axis=0) == -3) or np.any(np.sum(self.grid, axis=1) == -3):
self.end = True
self.winner = 'O'
# check diagnols
elif self.grid[[0,1,2],[0,1,2]].sum() == 3 or self.grid[[0,1,2],[2,1,0]].sum() == 3:
self.end = True
self.winner = 'X'
elif self.grid[[0,1,2],[0,1,2]].sum() == -3 or self.grid[[0,1,2],[2,1,0]].sum() == -3:
self.end = True
self.winner = 'O'
# check if all the positions are filled
elif (self.grid == 0).sum() == 0:
self.end = True
self.winner = None # no one wins
else:
self.end = False
self.winner = None
def reset(self):
# reset the grid
self.grid = np.zeros((3,3))
self.end = False
self.winner = None
self.num_step = 0
self.current_player = 'X'
return self.grid.copy(), self.end, self.winner
def observe(self):
return self.grid.copy(), self.end, self.winner
def reward(self, player='X'):
if self.end:
if self.winner is None:
return 0
else:
return 1 if player == self.winner else -1
else:
return 0
def render(self):
# print current grid
value2player = {0: '-', 1: 'X', -1: 'O'}
for i in range(3):
print('|', end='')
for j in range(3):
print(value2player[int(self.grid[i,j])], end=' ' if j<2 else '')
print('|')
print()
class OptimalPlayer:
'''
Description:
A class to implement an epsilon-greedy optimal player in Tic-tac-toe.
About optimial policy:
There exists an optimial policy for game Tic-tac-toe. A player ('X' or 'O') can win or at least draw with optimial strategy.
See the wikipedia page for details https://en.wikipedia.org/wiki/Tic-tac-toe
In short, an optimal player choose the first available move from the following list:
[Win, BlockWin, Fork, BlockFork, Center, Corner, Side]
Parameters:
epsilon: float, in [0, 1]. This is a value between 0-1 that indicates the
probability of making a random action instead of the optimal action
at any given time.
'''
def __init__(self, epsilon=0.2, player='X'):
self.epsilon = epsilon
self.player = player # 'x' or 'O'
def set_player(self, player = 'X', j=-1):
self.player = player
if j != -1:
self.player = 'X' if j % 2 == 0 else 'O'
def empty(self, grid):
'''return all empty positions'''
avail = []
for i in range(9):
pos = (int(i/3), i % 3)
if grid[pos] == 0:
avail.append(pos)
return avail
def center(self, grid):
'''
Pick the center if its available,
if it's the first step of the game, center or corner are all optimial.
'''
if np.abs(grid).sum() == 0:
# first step of the game
return [(1, 1)] + self.corner(grid)
return [(1, 1)] if grid[1, 1] == 0 else []
def corner(self, grid):
''' Pick empty corners to move '''
corner = [(0, 0), (0, 2), (2, 0), (2, 2)]
cn = []
# First, pick opposite corner of opponent if it's available
for i in range(4):
if grid[corner[i]] == 0 and grid[corner[3 - i]] != 0:
cn.append(corner[i])
if cn != []:
return cn
else:
for idx in corner:
if grid[idx] == 0:
cn.append(idx)
return cn
def side(self, grid):
''' Pick empty sides to move'''
rt = []
for idx in [(0, 1), (1, 0), (1, 2), (2, 1)]:
if grid[idx] == 0:
rt.append(idx)
return rt
def win(self, grid, val=None):
''' Pick all positions that player will win after taking it'''
if val is None:
val = 1 if self.player == 'X' else -1
towin = []
# check all positions
for pos in self.empty(grid):
grid_ = np.copy(grid)
grid_[pos] = val
if self.checkWin(grid_, val):
towin.append(pos)
return towin
def blockWin(self, grid):
''' Find the win positions of opponent and block it'''
oppon_val = -1 if self.player == 'X' else 1
return self.win(grid, oppon_val)
def fork(self, grid, val=None):
''' Find a fork opportunity that the player will have two positions to win'''
if val is None:
val = 1 if self.player == 'X' else -1
tofork = []
# check all positions
for pos in self.empty(grid):
grid_ = np.copy(grid)
grid_[pos] = val
if self.checkFork(grid_, val):
tofork.append(pos)
return tofork
def blockFork(self, grid):
''' Block the opponent's fork.
If there is only one possible fork from opponent, block it.
Otherwise, player should force opponent to block win by making two in a row or column
Amomg all possible force win positions, choose positions in opponent's fork in prior
'''
oppon_val = -1 if self.player == 'X' else 1
oppon_fork = self.fork(grid, oppon_val)
if len(oppon_fork) <= 1:
return oppon_fork
# force the opponent to block win
force_blockwin = []
val = 1 if self.player == 'X' else -1
for pos in self.empty(grid):
grid_ = np.copy(grid)
grid_[pos] = val
if np.any(np.sum(grid_, axis=0) == val*2) or np.any(np.sum(grid_, axis=1) == val*2):
force_blockwin.append(pos)
force_blockwin_prior = []
for pos in force_blockwin:
if pos in oppon_fork:
force_blockwin_prior.append(pos)
return force_blockwin_prior if force_blockwin_prior != [] else force_blockwin
def checkWin(self, grid, val=None):
# check whether the player corresponding to the val will win
if val is None:
val = 1 if self.player == 'X' else -1
target = 3 * val
# check rows and cols
if np.any(np.sum(grid, axis=0) == target) or np.any(np.sum(grid, axis=1) == target):
return True
# check diagnols
elif grid[[0,1,2],[0,1,2]].sum() == target or grid[[0,1,2],[2,1,0]].sum() == target:
return True
else:
return False
def checkFork(self, grid, val=None):
# check whether the player corresponding to the val will fork
if val is None:
val = 1 if self.player == 'X' else -1
target = 2 * val
# check rows and cols
rows = (np.sum(grid, axis=0) == target).sum()
cols = (np.sum(grid, axis=1) == target).sum()
diags = (grid[[0,1,2],[0,1,2]].sum() == target) + (grid[[0,1,2],[2,1,0]].sum() == target)
if (rows + cols + diags) >= 2:
return True
else:
return False
def randomMove(self, grid):
""" Chose a random move from the available options. """
avail = self.empty(grid)
return avail[random.randint(0, len(avail)-1)]
def act(self, grid, **kwargs):
"""
Goes through a hierarchy of moves, making the best move that
is currently available each time (with probabitity 1-self.epsilon).
A touple is returned that represents (row, col).
"""
# whether move in random or not
if random.random() < self.epsilon:
return self.randomMove(grid)
### optimial policies
# Win
win = self.win(grid)
if win != []:
return win[random.randint(0, len(win)-1)]
# Block win
block_win = self.blockWin(grid)
if block_win != []:
return block_win[random.randint(0, len(block_win)-1)]
# Fork
fork = self.fork(grid)
if fork != []:
return fork[random.randint(0, len(fork)-1)]
# Block fork
block_fork = self.blockFork(grid)
if block_fork != []:
return block_fork[random.randint(0, len(block_fork)-1)]
# Center
center = self.center(grid)
if center != []:
return center[random.randint(0, len(center)-1)]
# Corner
corner = self.corner(grid)
if corner != []:
return corner[random.randint(0, len(corner)-1)]
# Side
side = self.side(grid)
if side != []:
return side[random.randint(0, len(side)-1)]
# random move
return self.randomMove(grid)