run_grid_game.py
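"""Run Nash-Q learning on a two-agent grid game.

Two NashQLearner agents are trained against each other on a GridGame.
Every 500 episodes a test episode (learning disabled) is run from fixed
start positions; the resulting step counts and average rewards are
plotted and saved to result.png.
"""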
import numpy as np
import matplotlib.pyplot as plt

from nash_q_learner import NashQLearner
from grid_game import GridGame


def run_episode(learning, max_steps, agents, game, is_plot=False):
    """Run one episode; return the final step index and each agent's mean reward."""
    for step in range(max_steps):
        # Each agent picks an action for its current state.
        actions = {}
        for agent in agents:
            action = agent.act(training=learning)
            actions[agent.id] = action

        observations, rewards, is_terminal = game.step(actions)

        # Each agent observes its own reward plus the opponent's reward
        # and last action, which the Nash-Q update uses.
        agents[0].observe(
            state=observations[0],
            reward=rewards[0],
            reward_o=rewards[1],
            opponent_action=agents[1].prev_action,
            learning=learning)
        agents[1].observe(
            state=observations[1],
            reward=rewards[1],
            reward_o=rewards[0],
            opponent_action=agents[0].prev_action,
            learning=learning)

        if is_plot:
            game.print_map()
        if is_terminal:
            break

    if is_plot:
        print(agents[0].reward_history)
        print(agents[1].reward_history)
        print(agents[0].action_history)
        print(agents[1].action_history)

    average_rewards = [np.mean(agents[0].reward_history),
                       np.mean(agents[1].reward_history)]
    return step, average_rewards


if __name__ == '__main__':
    nb_episode = 30000
    max_steps = 10000
    actions = np.arange(4)  # the grid game's four discrete actions

    game = GridGame(nb_agents=2)
    ini_pos = game.create_observations()

    agent1 = NashQLearner(
        id=0,
        epsilon=0.02,
        ini_state=ini_pos[0],
        actions=actions)
    agent2 = NashQLearner(
        id=1,
        epsilon=0.02,
        ini_state=ini_pos[1],
        actions=actions)

    step_history = []
    reward_history = {"0": [], "1": []}
    for episode in range(nb_episode):
        # Training episode.
        observations = game.reset()
        agent1.reset(state=observations[0])
        agent2.reset(state=observations[1])
        step, rewards = run_episode(
            learning=True, max_steps=max_steps,
            agents=[agent1, agent2], game=game)

        if episode % 500 == 0:
            # Test episode: learning disabled, fixed start positions.
            observations = game.reset(pos_list=((0, 1), (2, 1)))
            agent1.reset(state=observations[0])
            agent2.reset(state=observations[1])
            is_plot = False
            step, rewards = run_episode(
                learning=False, max_steps=max_steps,
                agents=[agent1, agent2], game=game, is_plot=is_plot)
            step_history.append(step)
            reward_history["0"].append(rewards[0])
            reward_history["1"].append(rewards[1])
            print("------------------------------------")
            print(f"episode:{episode}, step:{step}, "
                  f"a0:{rewards[0]}, a1:{rewards[1]}")
            print("------------------------------------")
    # Plot test results: steps per test episode, then each agent's
    # average reward per test episode.
    plt.figure(figsize=(12, 8))

    plt.subplot(3, 1, 1)
    plt.plot(np.arange(len(step_history)), step_history, label="step")
    plt.legend()

    plt.subplot(3, 1, 2)
    reward_history["0"] = np.array(reward_history["0"])
    reward_history["1"] = np.array(reward_history["1"])
    plt.plot(np.arange(len(reward_history["0"])),
             reward_history["0"], label="reward_history0")
    plt.ylim(-50, 30)
    plt.legend()

    plt.subplot(3, 1, 3)
    plt.plot(np.arange(len(reward_history["1"])),
             reward_history["1"], label="reward_history1")
    plt.ylim(-50, 30)
    plt.legend()

    plt.savefig("result.png")
    plt.show()