Commit 952a63a
Update C51 algorithm hyperparameters and switch environment to LunarLander (#28)
* Update the C51 hyperparameters: change the initial epsilon value and the decay strategy, and switch the environment to LunarLander * Fix C51 * Adjust param
1 parent 982bc91 commit 952a63a

File tree

5 files changed: +36 −64 lines changed


AWR/main.py

Lines changed: 5 additions & 4 deletions
@@ -52,8 +52,8 @@ def main(
   env = TrainMonitor(env, tensorboard_dir="./logs", tensorboard_write_all=True)

   gamma = 0.995
-  lr_actor = 0.001
-  lr_critic = 0.001
+  lr_actor = 5e-5
+  lr_critic = 1e-4
   batch_size = 64
   beta = 0.01

@@ -64,10 +64,11 @@ def main(
   learn_iteration = 10
   num_workers = 32

-  grad_clip = 0.5
+  grad_clip = 100
   norm_factor = 10
   value_network_scale = True
   l2_loss_weight = 0.01
+  memory_size = int(1e8)

   agent = Agent(
       state_dims=env.observation_space.shape[0],
@@ -77,7 +78,7 @@ def main(
       gamma=gamma,
       batch_size=batch_size,
       forget_experience=False,
-      mem_size=100000,
+      mem_size=memory_size,
       beta=beta,
       td_lambda=td_lambda,
       awr_beta=awr_beta,
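For context on the grad_clip change from 0.5 to 100: in PyTorch training loops a value like this is usually the max_norm argument of gradient-norm clipping. The sketch below shows that common pattern; whether this repo's Agent clips by global norm (rather than by value) is an assumption, and `network` and `optimizer` are placeholders, not names from this codebase.

import torch
import torch.nn as nn

network = nn.Linear(8, 2)  # stand-in for the actor/critic network
optimizer = torch.optim.Adam(network.parameters(), lr=5e-5)

loss = network(torch.randn(64, 8)).pow(2).mean()  # dummy loss
optimizer.zero_grad()
loss.backward()
# rescale gradients so their global norm is at most grad_clip, then step
torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=100)
optimizer.step()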

C51/C51Agent_100.gif

325 KB

C51/C51Agent_100_CarPole.gif

385 KB

C51/c51.py

Lines changed: 21 additions & 53 deletions
@@ -15,11 +15,11 @@ def __init__(
       self,
       state_dim,
       action_space,
-      n_atoms,
+      n_atoms=51,
       seed=0,
-      hidden_size=None,
-      init_weight_gain=np.sqrt(2),
-      init_bias=0
+      fc1_unit=128,
+      fc2_unit=128,
+      fc3_unit=128,
   ):
     """
     Initialize parameters and build model.
@@ -31,58 +31,26 @@ def __init__(
       fc1_unit (int): Number of nodes in first hidden layer
       fc2_unit (int): Number of nodes in second hidden layer
     """
-    super().__init__()
+    super().__init__()  ## calls __init__ method of nn.Module class
+    self.seed = torch.manual_seed(seed)
     self.action_space = action_space
     self.n_atoms = n_atoms
-    self.seed = torch.manual_seed(seed)
-    self.hidden_size = (100, 100, 100) if not hidden_size else hidden_size
-    self.bn = nn.BatchNorm1d(state_dim)
-
-    def init_weights(m):
-      if isinstance(m, nn.Linear):
-        nn.init.orthogonal_(m.weight, gain=init_weight_gain)
-        nn.init.constant_(m.bias, init_bias)
-
-    # note: The self.hidden_layers attribute is defined as a list of lists,
-    # note: but it should be a list of `nn.Sequential` objects.
-    # note: You can fix this by using `nn.Sequential` to define each layer.
-    # note: After using `nn.Sequential`, you need to define a list with
-    # note: `nn.ModuleList` to construct the model graph.
-    self.hidden_layers = nn.ModuleList([
-        nn.Sequential(nn.Linear(in_size, out_size), nn.LeakyReLU())
-        for in_size, out_size in zip((state_dim, ) +
-                                     self.hidden_size, self.hidden_size)
-    ])
-    self.hidden_layers.apply(init_weights)
-
-    def init_output_weights(m):
-      if isinstance(m, nn.Linear):
+    self.fc1 = nn.Linear(state_dim, fc1_unit)
+    self.fc2 = nn.Linear(fc1_unit, fc2_unit)
+    self.fc3 = nn.Linear(fc2_unit, fc3_unit)
+    self.fc4 = nn.Linear(fc3_unit, action_space * n_atoms)

-        nn.init.orthogonal_(m.weight, gain=init_weight_gain)
-        nn.init.constant_(m.bias, init_bias)
-
-    self.output_layers = nn.ModuleList([
-        nn.Sequential(
-            nn.Linear(self.hidden_size[-1], n_atoms), nn.LeakyReLU(),
-            nn.Softmax(dim=-1)
-        ) for _ in range(action_space)
-    ])
-
-    self.output_layers.apply(init_output_weights)
-
-  def forward(self, state):
-    x = self.bn(state)
-    for hidden_layer in self.hidden_layers:
-      x = hidden_layer(x)
-    out = torch.concat([
-        torch.unsqueeze(output_layer(x), dim=1)
-        for output_layer in self.output_layers
-    ],
-                       dim=1)
-    # x = self.output_layer(x)
-    # x = torch.reshape(x, (-1, self.action_space, self.n_atoms))
-    # x = F.softmax(x, dim=-1)
-    return out
+  def forward(self, x):
+    """
+    Build a network that maps state -> action values.
+    """
+    x = F.leaky_relu(self.fc1(x))
+    x = F.leaky_relu(self.fc2(x))
+    x = F.leaky_relu(self.fc3(x))
+    x = self.fc4(x)
+    x = torch.reshape(x, (-1, self.action_space, self.n_atoms))
+    x = F.softmax(x, dim=-1)
+    return x


 # device = torch.device("cpu")
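The rewritten network returns a softmax over n_atoms support points per action rather than scalar Q-values. A minimal standalone sketch of how such a distributional output is typically reduced to Q-values for action selection in C51; the support bounds and the random `probs` tensor here are illustrative, not taken from this repo:

import torch

n_atoms, v_min, v_max = 51, -100, 100
support = torch.linspace(v_min, v_max, n_atoms)  # atom values z_i

# probs: output of QNetwork.forward, shape (batch, action_space, n_atoms)
probs = torch.softmax(torch.randn(1, 4, n_atoms), dim=-1)
q_values = (probs * support).sum(dim=-1)  # Q(s, a) = sum_i z_i * p_i(s, a)
greedy_action = q_values.argmax(dim=-1)   # shape (batch,)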

C51/main.py

Lines changed: 10 additions & 7 deletions
@@ -1,5 +1,6 @@
 """main executable file for Distribution Q learning."""
 import os
+import math
 import logging
 from itertools import repeat
 import gymnasium as gym
@@ -24,9 +25,9 @@
 def main(
     n_episodes=2000,
     max_t=500,
-    eps_start=1,
+    eps_start=0.9,
     eps_end=0.01,
-    eps_decay=0.996,
+    eps_decay=1000,
     score_term_rules=lambda s: False,
     time_interval="25ms"
 ):
@@ -45,17 +46,17 @@ def main(
   scores = []  # list containing score from each episode
   scores_window = deque(maxlen=100)  # last 100 scores
   eps = eps_start
-  env = gym.make("CartPole-v1", render_mode="rgb_array")
+  env = gym.make("LunarLander-v3", render_mode="rgb_array")
   env = TrainMonitor(env, tensorboard_dir="./logs", tensorboard_write_all=True)

   gamma = 0.99
-  lr = 0.0001
+  lr = 1e-4
   batch_size = 64
   learn_iteration = 16
   update_q_target_freq = 4
   n_atoms = 51
-  v_min = -20
-  v_max = 20
+  v_min = -100
+  v_max = 100

   agent = Agent(
       state_dims=env.observation_space.shape[0],
@@ -89,7 +90,9 @@ def main(

     scores_window.append(score)  ## save the most recent score
     scores.append(score)  ## save the most recent score
-    eps = max(eps * eps_decay, eps_end)  ## decrease the epsilon
+    # eps = max(eps * eps_decay, eps_end)  ## decrease the epsilon
+    eps = eps_end + (eps_start - eps_end) * \
+        math.exp(-1. * t * i_episode * learn_iteration / eps_decay)
     print(" " * os.get_terminal_size().columns, end="\r")
     print(
         f"\rEpisode {i_episode}\tAverage Score {np.mean(scores_window):.2f}",
