import numpy as np
import matplotlib.pyplot as plt
from degree_freedom_queen import *
from degree_freedom_king1 import *
from degree_freedom_king2 import *
from features import *
from generate_game import *
from Q_values import *
size_board = 4
def main():
"""
Generate a new game
The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
do not cause any threats to each other.
s: a size_board x size_board matrix filled with zeros and three numbers:
1 = location of the King
2 = location of the Queen
3 = location fo the Enemy King
p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
number the colunm
p_k1: same as p_k2 but for the King
p_q1: same as p_k2 but for the Queen
"""
s, p_k2, p_k1, p_q1 = generate_game(size_board)
"""
Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left,
down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of
the board - 1
"""
possible_queen_a = (s.shape[0] - 1) * 8
"""
Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left,
down-right)
"""
possible_king_a = 8
# Total number of actions for Player 1 = actions of King + actions of Queen
N_a = possible_king_a + possible_queen_a
"""
Possible actions of the King
This functions returns the locations in the chessboard that the King can go
dfK1: a size_board x size_board matrix filled with 0 and 1.
1 = locations that the king can move to
a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1):
down, up, right, left, down-right, down-left, up-right, up-left
"""
dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
"""
Possible actions of the Queen
Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
"""
dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
"""
Possible actions of the Enemy King
Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
"""
dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
"""
Compute the features
x is a Nx1 vector computing a number of input features based on which the network should adapt its weights
with board size of 4x4 this N=50
"""
x = features(p_q1, p_k1, p_k2, dfK2, s, check)
    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights
    should be initialised according to a uniform distribution and rescaled by the total number of connections between
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden
    layer, each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
    should be initialized with zeros.
    """
    N_in = x.shape[0]  # number of neurons in the input layer = number of features (50 for a 4x4 board)
    N_h = 200          # number of neurons in the hidden layer
    # Weights drawn from a uniform distribution and rescaled by the number of connections
    # between the two layers; biases initialised to zero
    W1 = np.random.uniform(-1, 1, [N_in, N_h]) / (N_in * N_h)
    b1 = np.zeros([N_h])
    W2 = np.random.uniform(-1, 1, [N_h, N_a]) / (N_h * N_a)
    b2 = np.zeros([N_a])
    # YOUR CODE ENDS HERE
    # Network / learning parameters
    epsilon_0 = 0.2      # initial epsilon for the epsilon-greedy policy
    beta = 0.00005       # epsilon decay factor
    gamma = 0.85         # discount factor
    eta = 0.0035         # learning rate
    N_episodes = 100000  # number of games; each game ends with a checkmate or a draw
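    # For example, with epsilon_0 = 0.2 and beta = 0.00005, the exploration rate
    # epsilon_f = epsilon_0 / (1 + beta * n) decays from 0.2 in the first episode
    # to roughly 0.2 / 6 ≈ 0.033 by episode 100,000.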
    ### Training Loop ###
    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to the current column
    map = np.array([[1, 0],
                    [-1, 0],
                    [0, 1],
                    [0, -1],
                    [1, 1],
                    [1, -1],
                    [-1, 1],
                    [-1, -1]])
    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE BELOW FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.
    R_save = np.zeros([N_episodes, 1])
    N_moves_save = np.zeros([N_episodes, 1])
    # END OF SUGGESTIONS
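    # Inside the loop below the network weights are updated with one Q-learning step per move
    # (see the FILL THE CODE docstrings). The TD error is
    #     delta = R + gamma * max_a' Q(s', a')   - Q(s, a)   (delta = R - Q(s, a) on the final move of an episode)
    # and it is backpropagated through the ReLU hidden layer, so
    #     W2[:, a] += eta * delta * x1,                            b2[a] += eta * delta
    #     W1       += eta * outer(x, delta * W2[:, a] * (x1 > 0)), b1    += eta * delta * W2[:, a] * (x1 > 0)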
    for n in range(N_episodes):
        epsilon_f = epsilon_0 / (1 + beta * n)  # decayed exploration rate for this episode
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0       # 0 = not a draw, 1 = draw
        i = 1          # counter for movements
        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)
        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
        if n % 50 == 0 and n > 0:
            print('Episode', n, '- average reward:', np.mean(R_save[:n]),
                  '- average number of moves:', np.mean(N_moves_save[:n]))
        while checkmate == 0 and draw == 0:
            R = 0  # Reward
            # Player 1
            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]
            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # FILL THE CODE
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested.
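            # A minimal sketch (an assumption, not the contents of Q_values.py) of what Q_values
            # is expected to compute: a ReLU hidden layer followed by a linear output layer,
            # returning both the Q-values and the hidden activation x1, which the
            # backpropagation steps further down rely on:
            #
            #     def Q_values(x, W1, W2, b1, b2):
            #         x1 = np.maximum(0, np.dot(x, W1) + b1)  # hidden layer with ReLU activation
            #         Q = np.dot(x1, W2) + b2                 # one Q-value per action
            #         return Q, x1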
            Q, x1 = Q_values(x, W1, W2, b1, b2)
            """
            YOUR CODE STARTS HERE
            FILL THE CODE
            Implement the epsilon-greedy policy by using the vector a and the allowed_a vector: be careful that the action
            must be chosen from the allowed_a vector. The index of this action must be remapped to the index of the vector
            a, containing all the possible actions. Create a variable called a_agent that contains the index of the action
            chosen. For instance, if allowed_a = [8, 16, 32] and you select the third action, a_agent = 32, not 3.
            """
            Qvalues = np.copy(Q[allowed_a])
            # With probability epsilon_f pick a random allowed action (exploration),
            # otherwise pick the allowed action with the highest Q-value (exploitation)
            if np.random.uniform(0, 1) < epsilon_f:
                a_agent = np.random.choice(allowed_a)
            else:
                a_agent = allowed_a[np.argmax(Qvalues)]
            # THE CODE ENDS HERE
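            # The selected index a_agent is decoded below: indices 0 .. possible_queen_a-1 are Queen
            # moves (direction = index // (size_board - 1), steps = remainder + 1), and the remaining
            # 8 indices are one-square King moves. E.g. with size_board = 4, a_agent = 4 decodes to
            # direction 1 (up) and steps 2, i.e. "move the Queen two squares up".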
            # Player 1 makes the action
            if a_agent < possible_queen_a:
                # Queen move: decode the action index into a direction and a number of steps
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1
                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]
            else:
                # King move: one square in the chosen direction
                direction = a_agent - possible_queen_a
                steps = 1
                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]
            # Compute the allowed actions for the new position
            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
            # Player 2
            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last
                iteration of the episode, the agent gave checkmate.
                """
                # Terminal move: the target is simply the reward R
                delta = R - Q[a_agent]
                # Backpropagate through the ReLU hidden layer (x1 is the hidden activation)
                dQ_dh = W2[:, a_agent] * (x1 > 0)
                W2[:, a_agent] = W2[:, a_agent] + eta * delta * x1
                b2[a_agent] = b2[a_agent] + eta * delta
                W1 = W1 + eta * np.outer(x, delta * dQ_dh)
                b1 = b1 + eta * delta * dQ_dh
                R_save[n] = R
                N_moves_save[n] = i
                # THE CODE ENDS HERE
                if checkmate:
                    break
            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked: stalemate
                draw = 1
                R = 0.1
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last
                iteration of the episode, it is a draw.
                """
                # Terminal move: the target is simply the reward R
                delta = R - Q[a_agent]
                dQ_dh = W2[:, a_agent] * (x1 > 0)  # backpropagate through the ReLU hidden layer
                W2[:, a_agent] = W2[:, a_agent] + eta * delta * x1
                b2[a_agent] = b2[a_agent] + eta * delta
                W1 = W1 + eta * np.outer(x, delta * dQ_dh)
                b1 = b1 + eta * delta * dQ_dh
                R_save[n] = R
                N_moves_save[n] = i
                # YOUR CODE ENDS HERE
                if draw:
                    break
            else:
                # Move the enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]
                direction = a_enemy
                steps = 1
                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3
                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]
            # Update the parameters
            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute the Q-values of the next state for the discounted target
            Q_next, _ = Q_values(x_next, W1, W2, b1, b2)
            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last
            iteration of the episode, the match continues.
            """
            # Non-terminal move: the Q-learning target uses the discounted value of the best next action
            delta = R + gamma * np.max(Q_next) - Q[a_agent]
            dQ_dh = W2[:, a_agent] * (x1 > 0)  # backpropagate through the ReLU hidden layer
            W2[:, a_agent] = W2[:, a_agent] + eta * delta * x1
            b2[a_agent] = b2[a_agent] + eta * delta
            W1 = W1 + eta * np.outer(x, delta * dQ_dh)
            b1 = b1 + eta * delta * dQ_dh
            # YOUR CODE ENDS HERE
            i += 1
if __name__ == '__main__':
    main()