MCTSPlayerOnline.py
import random
import datetime
from math import log, sqrt

# Assumed import: adjust the module path to wherever NoThanksBoard is
# actually defined in this project.
from NoThanksBoard import NoThanksBoard

class MCTSPlayerOnline:
    """Monte Carlo Tree Search player.

    Online only (no pre-training): statistics are gathered from scratch on
    every call to get_action.
    """

    def __init__(self, game, thinking_time=1):
        assert thinking_time > 0
        self.game = game
        # Assumption: the game object exposes the player count and config
        # that get_action needs to rebuild fresh boards during search.
        self.n_players = game.n_players
        self.config = game.config
        self.thinking_time = thinking_time
        self.max_moves = 200
        self.C = 1.4  # exploration constant: higher C means more exploration
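
    # The exploration constant C feeds the UCB1 score used during the
    # selection step in run_simulation:
    #
    #     score = wins / plays + C * sqrt(ln(total_plays) / plays)
    #
    # where total_plays is the number of plays recorded across all legal
    # actions at the current state. The first term favours actions with a
    # high observed win rate (exploitation); the second grows for rarely
    # tried actions (exploration), so a larger C explores more.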

    def get_action(self, state, legal_actions):
        self.max_depth = 0
        board = NoThanksBoard(self.n_players, self.config)
        player = state[2][3]

        if not legal_actions:
            return None
        if len(legal_actions) == 1:
            return legal_actions[0]

        plays, wins = {}, {}
        games = 0

        calculation_delta = datetime.timedelta(seconds=self.thinking_time)
        begin = datetime.datetime.utcnow()
        while datetime.datetime.utcnow() - begin < calculation_delta:
            board = NoThanksBoard(self.n_players, self.config)
            plays, wins = self.run_simulation(state, board, plays, wins)
            games += 1
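
        # Once the time budget expires, pick the move with the highest
        # observed win percentage; no exploration bonus is applied at the
        # root, and actions never tried default to 0%.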
        percent_wins, chosen_action = max(
            (100 * wins.get((player, state, action), 0) /
             plays.get((player, state, action), 1),
             action)
            for action in legal_actions
        )
        return chosen_action

    def run_simulation(self, state, board, plays, wins):
        """Run a single simulation of MCTS from state."""
        visited_actions = set()
        player = board.current_player(state)
        # phases: "selection", "expansion", "end_expansion", "simulation",
        # "backpropagation"
        phase = "selection"

        for t in range(1, self.max_moves + 1):
            legal_actions = board.legal_actions(state)

            if all(plays.get((player, state, action)) for action in legal_actions):
                # if we have stats on all of the legal moves, use them
                log_total = log(
                    sum(plays[(player, state, action)] for action in legal_actions))
                value, action = max(
                    ((wins[(player, state, action)] / plays[(player, state, action)]) + self.C *
                     sqrt(log_total / plays[(player, state, action)]), action)
                    for action in legal_actions
                )
            else:
                if phase == "selection":
                    phase = "expansion"
                # Otherwise just pick a random action; this also serves as
                # the play-out policy during the simulation phase.
                action = random.choice(legal_actions)

            if phase == "expansion" and (player, state, action) not in plays:
                plays[(player, state, action)] = 0
                wins[(player, state, action)] = 0
                if t > self.max_depth:
                    self.max_depth = t
                phase = "end_expansion"

            if phase == "selection" or phase == "expansion":
                visited_actions.add((player, state, action))
            elif phase == "end_expansion":
                visited_actions.add((player, state, action))
                phase = "simulation"

            # move to the next state
            state = board.next_state(state, action)
            player = board.current_player(state)

            winner = board.winner(state)
            if winner is not None:
                break

        phase = "backpropagation"
        for player, state, action in visited_actions:
            plays[(player, state, action)] += 1
            if player == winner:
                wins[(player, state, action)] += 1

        return plays, wins


if __name__ == "__main__":
    # The player requires a game object exposing n_players and config;
    # building one is project-specific, so the example is left as comments.
    # game = ...                                    # a No Thanks! game instance
    # mcts_player = MCTSPlayerOnline(game, thinking_time=1)
    pass
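
    # Hedged usage sketch: given a current state and its legal actions from
    # the game (the exact calls depend on the NoThanksBoard API), the search
    # is driven entirely by:
    #
    #     action = mcts_player.get_action(state, legal_actions)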