Commit 881579e
Refactor utility functions and update LunarLander version (#26)
* move GAE into utils
* move n-step returns into utils
* move standardize etc. into utils
* move the Gumbel loss into utils
* rename LunarLander to v3
* add a devcontainer
* fix pre-commit
* change to .values
* update the GitHub Actions Python version to 3.12
1 parent e3d50d4 commit 881579e

31 files changed: +238 additions, -371 deletions

.devcontainer/devcontainer.json

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+ // For format details, see https://aka.ms/devcontainer.json. For config options, see the
+ // README at: https://github.com/devcontainers/templates/tree/main/src/anaconda
+ {
+   "remoteUser": "root",
+   "name": "tvm",
+   "workspaceMount": "source=${localWorkspaceFolder},target=/root/Desktop/dockerVolumn/${localWorkspaceFolderBasename},type=bind",
+   "workspaceFolder": "/root/Desktop/dockerVolumn/${localWorkspaceFolderBasename}",
+   "image": "alwaysproblem/fastdev-u2204:conda-nv12.2.0",
+   // Features to add to the dev container. More info: https://containers.dev/features.
+   // "features": {},
+   // Use 'forwardPorts' to make a list of ports inside the container available locally.
+   // "forwardPorts": [],
+   // Use 'postCreateCommand' to run commands after the container is created.
+   "postCreateCommand": "bash init.sh",
+   // Configure tool-specific properties.
+   // "customizations": {},
+   // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+   // "remoteUser": "root"
+   "privileged": true,
+   // "capAdd": ["SYS_PTRACE"],
+   "mounts": [
+     {
+       "source": "/home/lemon/Desktop/dockerVolumn",
+       "target": "/root/Desktop/dockerVolumn",
+       "type": "volume"
+     }
+   ],
+   "runArgs": [
+     // "--cap-add=SYS_PTRACE",
+     // "--security-opt",
+     // "seccomp=unconfined",
+     "--gpus=0",
+     "--shm-size=4G",
+     "memlock=-1:-1",
+     "--ulimit",
+     "--name",
+     "yyx-rltorch",
+   ]
+ }

.devcontainer/init.sh

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+ #!/bin/bash
+
+ setup_new_user 1000 1000
+ git config --global --add safe.directory "*"
+
+ source /root/miniconda3/etc/profile.d/conda.sh
+
+ conda create -n rltorch \
+   pytorch torchvision torchaudio \
+   pytorch-cuda=12.1 gymnasium pyglet \
+   pygame gymnasium-box2d colorama \
+   pylint yapf tqdm 'tensorboardx>=2.5.0' \
+   'tensorboard>2.0' pillow matplotlib scipy \
+   seaborn ipykernel -c conda-forge -c pytorch -c nvidia
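
init.sh only creates the conda environment; it does not activate it. As a quick sanity check (hypothetical, not part of the commit), the PyTorch build pinned above can be verified from inside the container after running conda activate rltorch:

import torch

print(torch.__version__)          # the pytorch-cuda=12.1 build requested in init.sh
print(torch.cuda.is_available())  # True only if the GPU passed via the devcontainer's --gpus arg is visible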

.github/workflows/pre-commit.yaml

Lines changed: 1 addition & 1 deletion
@@ -12,5 +12,5 @@ jobs:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
-         python-version: '3.11'
+         python-version: '3.12'
      - uses: pre-commit/action@v3.0.0

.pre-commit-config.yaml

Lines changed: 4 additions & 16 deletions
@@ -1,38 +1,26 @@
  repos:
    - repo: https://github.com/pre-commit/pre-commit-hooks
-     rev: v4.4.0
+     rev: v5.0.0
      hooks:
        - id: check-yaml
        - id: end-of-file-fixer
        - id: trailing-whitespace
        - id: end-of-file-fixer

    - repo: https://github.com/pycqa/pylint
-     rev: v2.16.2
+     rev: v3.3.1
      hooks:
        - id: pylint
          args:
            - "--rcfile=.pylintrc"
          exclude: tests(/\w*)*/

    - repo: https://github.com/google/yapf
-     rev: v0.40.1
+     rev: v0.40.2
      hooks:
        - id: yapf

    - repo: https://github.com/pre-commit/mirrors-mypy
-     rev: v1.0.0
+     rev: v1.13.0
      hooks:
        - id: mypy
-
-   - repo: https://github.com/pre-commit/mirrors-clang-format
-     rev: 'v15.0.7'
-     hooks:
-       - id: clang-format
-         types_or: [c++, c]
-
-   - repo: https://github.com/mwouts/jupytext
-     rev: v1.14.4
-     hooks:
-       - id: jupytext
-         args: [--sync]

.pylintrc

Lines changed: 1 addition & 0 deletions
@@ -146,6 +146,7 @@ disable=abstract-method,
          wrong-import-order,
          xrange-builtin,
          zip-builtin-not-iterating,
+         too-many-positional-arguments


  [REPORTS]
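
The new disable entry pairs with the pylint bump above: pylint 3.3 introduces a too-many-positional-arguments check, and silencing it keeps the repository's existing signatures passing without a rewrite. A hypothetical example of the kind of signature that check flags (not taken from the repo):

# pylint 3.3+ would report too-many-positional-arguments here unless the check
# is disabled or some parameters are made keyword-only.
def make_agent(state_dim, action_dim, lr, gamma, eps_start, eps_end):
  return dict(state_dim=state_dim, action_dim=action_dim, lr=lr,
              gamma=gamma, eps_start=eps_start, eps_end=eps_end)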

AC/a2c.py

Lines changed: 10 additions & 49 deletions
@@ -7,13 +7,7 @@

  from util.agent import Agent
  from util.buffer import ReplayBuffer, Trajectory
-
-
- def standardize(v):
-   """Method to standardize a rank-1 np array."""
-   assert len(v) > 1, "Cannot standardize vector of size 1"
-   v_std = (v - v.mean()) / (v.std() + 1e-08)
-   return v_std
+ from util.algo import calc_gaes, calc_nstep_return, standardize


  class Actor(nn.Module):
@@ -211,39 +205,17 @@ def calc_nstep_advs_v_target(self, states, rewards, next_states, terminates):
      with torch.no_grad():
        next_v_pred = self.critic.forward(next_states)
      v_preds = self.critic.forward(states).detach()
-     n_steps_rets = self.calc_nstep_return(
-         rewards=rewards, dones=terminates, next_v_pred=next_v_pred
+     n_steps_rets = calc_nstep_return(
+         rewards=rewards,
+         dones=terminates,
+         next_v_pred=next_v_pred,
+         gamma=self.gamma,
+         n_steps=self.n_steps
      )
      advs = n_steps_rets - v_preds
      v_targets = n_steps_rets
      return standardize(advs), v_targets

-   def calc_nstep_return(self, rewards, dones, next_v_pred):
-     T = len(rewards)  #pylint: disable=invalid-name
-     rets = torch.zeros_like(rewards).to(device)
-     _ = 1 - dones
-
-     for i in range(T):
-       # we generate the vector like `gamma = [[γ⁰, γ¹, γ² ...γⁿ]]`
-       # and gamma x reward (vector) to obtain the value for each timestamp.
-       # There are a few items to make it to N
-       # and we will take account all the items.
-       rets[i] = torch.unsqueeze(
-           self.gamma ** torch.arange(len(rewards[i:min(self.n_steps + i, T)])
-                                     ).to(device),
-           dim=0
-       ) @ rewards[i:min(self.n_steps + i, T)]
-
-     if T > self.n_steps:
-       # [[γ⁰, γ¹, γ² ...γⁿ]] x reward.T + γⁿ⁺¹ * V(sₜ₊ₙ₊₁)
-       value_n_steps = self.gamma ** self.n_steps * next_v_pred[self.n_steps:]
-       rets = torch.cat([
-           value_n_steps,
-           torch.zeros(size=(self.n_steps, 1)).to(device)
-       ]) + rets
-
-     return rets
-
    def calc_gae_advs_v_target(self, states, rewards, next_states, terminates):
      """calculate the GAE (Generalized Advantage Estimation) and V_target.
@@ -264,23 +236,12 @@ def calc_gae_advs_v_target(self, states, rewards, next_states, terminates):
      next_v_pred = self.critic.forward(next_states[-1])
      v_preds = self.critic.forward(states).detach()
      v_preds_all = torch.concat((v_preds, next_v_pred.unsqueeze(0)), dim=0)
-     advs = self.calc_gaes(rewards, terminates, v_preds_all)
+     advs = calc_gaes(
+         rewards, terminates, v_preds_all, self.gamma, self.gae_lambda
+     )
      v_target = advs + v_preds
      return standardize(advs), v_target

-   def calc_gaes(self, rewards, dones, v_preds):
-     # GAE = ∑ₗ (γλ)ˡδₜ₊ₗ
-     # δₜ₊ₗ = rₜ + γV(sₜ₊₁) − V(sₜ)
-     T = len(rewards)  # pylint: disable=invalid-name
-     gaes = torch.zeros_like(rewards, device=device)
-     future_gae = torch.tensor(0.0, dtype=rewards.dtype, device=device)
-     not_dones = 1 - dones  # to reset at episode boundary by multiplying 0
-     deltas = rewards + self.gamma * v_preds[1:] * not_dones - v_preds[:-1]
-     coef = self.gamma * self.gae_lambda
-     for t in reversed(range(T)):
-       gaes[t] = future_gae = deltas[t] + coef * not_dones[t] * future_gae
-     return gaes
-
    def action(self, state, mode="eval"):
      if mode == "train":
        self.actor.train()
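
The shared module util/algo.py that these files now import is not among the files shown on this page. Reassembling the helpers deleted above, and matching the keyword arguments used at the new call sites, its contents are presumably close to the sketch below (the Gumbel-loss helper mentioned in the commit message is omitted because its old definition does not appear in this diff):

"""Sketch of util/algo.py, reassembled from the helpers removed in this commit."""
import torch

NORM_EPS = 1e-8


def standardize(v):
  """Standardize a rank-1 tensor to zero mean and unit variance."""
  assert len(v) > 1, "Cannot standardize vector of size 1"
  return (v - v.mean()) / (v.std() + NORM_EPS)


def scale_up_values(v, mean=0, std=1, norm_factor=1):
  """Inverse of scale_down_values: map normalized values back to raw scale."""
  return v / norm_factor * std + mean


def scale_down_values(v, mean=0, std=1, norm_factor=1):
  """Normalize values with a running mean/std and an extra norm factor."""
  return norm_factor * (v - mean) / (std + NORM_EPS)


def calc_nstep_return(rewards, dones, next_v_pred, gamma, n_steps):
  """N-step return: up to n discounted rewards plus a bootstrapped critic value.

  rewards, dones and next_v_pred are expected as (T, 1) tensors on one device;
  dones is kept for signature compatibility (the removed code did not use it).
  """
  T = len(rewards)  # pylint: disable=invalid-name
  rets = torch.zeros_like(rewards)
  for i in range(T):
    horizon = min(i + n_steps, T)
    # [γ⁰, γ¹, ..., γᵏ⁻¹] dotted with rewards[i:i+k]
    discounts = gamma ** torch.arange(
        horizon - i, dtype=rewards.dtype, device=rewards.device
    )
    rets[i] = (discounts.unsqueeze(-1) * rewards[i:horizon]).sum(dim=0)
  if T > n_steps:
    # bootstrap with γⁿ · next_v_pred for the steps that had a full n-step
    # window, matching the removed implementation
    bootstrap = gamma ** n_steps * next_v_pred[n_steps:]
    rets = rets + torch.cat(
        [bootstrap,
         torch.zeros(n_steps, 1, dtype=rewards.dtype, device=rewards.device)]
    )
  return rets


def calc_gaes(rewards, dones, v_preds, gamma, gae_lambda):
  """GAE(γ, λ): advₜ = Σₗ (γλ)ˡ δₜ₊ₗ with δₜ = rₜ + γV(sₜ₊₁) − V(sₜ)."""
  T = len(rewards)  # pylint: disable=invalid-name
  gaes = torch.zeros_like(rewards)
  future_gae = torch.tensor(0.0, dtype=rewards.dtype, device=rewards.device)
  not_dones = 1 - dones  # reset the recursion at episode boundaries
  deltas = rewards + gamma * v_preds[1:] * not_dones - v_preds[:-1]
  coef = gamma * gae_lambda
  for t in reversed(range(T)):
    gaes[t] = future_gae = deltas[t] + coef * not_dones[t] * future_gae
  return gaes

Passing gamma, gae_lambda and n_steps explicitly is what lets these helpers drop their dependence on self and on a module-level device, which is why the call sites in AC/a2c.py above and PPG/ppg.py below now spell those arguments out.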

AC/main.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def main(
    scores = []  # list containing score from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
-   env = gym.make("LunarLander-v2", render_mode="rgb_array")
+   env = gym.make("LunarLander-v3", render_mode="rgb_array")
    # env = gym.make("CartPole-v1", render_mode="rgb_array")

    env = TrainMonitor(env, tensorboard_dir="./logs", tensorboard_write_all=True)
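
The v2 to v3 bump tracks Gymnasium's renaming of the Box2D lander: newer Gymnasium releases register the environment as LunarLander-v3 in place of LunarLander-v2. A minimal smoke test, assuming gymnasium and gymnasium-box2d are installed as in the devcontainer above:

import gymnasium as gym

env = gym.make("LunarLander-v3", render_mode="rgb_array")
obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
env.close()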

AWR/awr.py

Lines changed: 1 addition & 17 deletions
@@ -8,23 +8,7 @@

  from util.agent import Agent
  from util.buffer import ReplayBuffer, Trajectory
-
- NORMEPS = 1e-8
-
-
- def standardize(v):
-   """Method to standardize a rank-1 np array."""
-   assert len(v) > 1, "Cannot standardize vector of size 1"
-   v_std = (v - v.mean()) / (v.std() + NORMEPS)
-   return v_std
-
-
- def scale_up_values(v, mean=0, std=1, norm_factor=1):
-   return v / norm_factor * std + mean
-
-
- def scale_down_values(v, mean=0, std=1, norm_factor=1):
-   return norm_factor * (v - mean) / (std + NORMEPS)
+ from util.algo import standardize, scale_down_values, scale_up_values


  class Actor(nn.Module):

AWR/main.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def main(
    scores = []  # list containing score from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
-   env = gym.make("LunarLander-v2", render_mode="rgb_array")
+   env = gym.make("LunarLander-v3", render_mode="rgb_array")
    # env = gym.make("CartPole-v1", render_mode="rgb_array")
    # max_t = 200

DDPG/main.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def main(
        render_mode="rgb_array",
    )
    # env = gym.make(
-   #     "LunarLander-v2",
+   #     "LunarLander-v3",
    #     render_mode="rgb_array",
    #     continuous=True,
    # )

DQN/dqn.py

Lines changed: 4 additions & 1 deletion
@@ -81,6 +81,8 @@ def __init__(
      self.qnetwork_target = Q(state_dim=state_dims,
                               action_space=action_space).to(device)
      self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=lr)
+     # self.optimizer = torch.optim.AdamW(
+     #     self.qnetwork_local.parameters(), lr=lr, amsgrad=True)

      # Replay memory
      self.memory = ProportionalPrioritizedReplayBuffer(max_size=mem_size)
@@ -164,7 +166,7 @@ def _learn(self, experiences):
          self.qnetwork_target.forward(next_states).detach(),
          dim=1,
          keepdim=True
-     )[0]
+     ).values

      self.memory.update(torch.abs(predicted_targets - labels).squeeze().tolist())

@@ -179,6 +181,7 @@ def _learn(self, experiences):
      # loss = self.loss(predicted_targets, labels)
      self.optimizer.zero_grad()
      loss.backward()
+     # torch.nn.utils.clip_grad_value_(self.qnetwork_local.parameters(), 100)
      self.optimizer.step()

    def update_targe_q(self):
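
The change from )[0] to ).values in _learn is cosmetic: torch.max called with a dim argument returns a named tuple (values, indices), so .values selects exactly the tensor that indexing with [0] did. A standalone illustration:

import torch

q_next = torch.tensor([[0.1, 0.9], [0.7, 0.2]])
out = torch.max(q_next, dim=1, keepdim=True)  # named tuple: (values, indices)
assert torch.equal(out.values, out[0])        # .values is the readable equivalent of [0]
print(out.values)                             # tensor([[0.9000], [0.7000]])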

MPO/main.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ def main(
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
    # env = gym.make("CartPole-v1", render_mode="rgb_array")
-   env = gym.make("LunarLander-v2", render_mode="rgb_array")
+   env = gym.make("LunarLander-v3", render_mode="rgb_array")

    env = TrainMonitor(env, tensorboard_dir="./logs", tensorboard_write_all=True)

MPO/mpo.py

Lines changed: 0 additions & 8 deletions
@@ -9,14 +9,6 @@
  from util.agent import Agent
  from util.buffer import ReplayBuffer, Trajectory

-
- def standardize(v):
-   """Method to standardize a rank-1 np array."""
-   assert len(v) > 1, "Cannot standardize vector of size 1"
-   v_std = (v - v.mean()) / (v.std() + 1e-08)
-   return v_std
-
-
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


PPG/main.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ def main(
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
    # env = gym.make("CartPole-v1", render_mode="rgb_array")
-   env = gym.make("LunarLander-v2", render_mode="rgb_array")
+   env = gym.make("LunarLander-v3", render_mode="rgb_array")

    env = TrainMonitor(env, tensorboard_dir="./logs", tensorboard_write_all=True)

PPG/ppg.py

Lines changed: 10 additions & 42 deletions
@@ -7,13 +7,7 @@

  from util.agent import Agent
  from util.buffer import ReplayBuffer, Trajectory, Experience
-
-
- def standardize(v):
-   """Method to standardize a rank-1 np array."""
-   assert len(v) > 1, "Cannot standardize vector of size 1"
-   v_std = (v - v.mean()) / (v.std() + 1e-08)
-   return v_std
+ from util.algo import calc_gaes, calc_nstep_return, standardize


  class Actor(nn.Module):
@@ -367,34 +361,17 @@ def calc_nstep_advs_v_target(self, states, rewards, next_states, terminates):
      with torch.no_grad():
        next_v_pred = self.critic.forward(next_states)
      v_preds = self.critic.forward(states).detach()
-     n_steps_rets = self.calc_nstep_return(
-         rewards=rewards, dones=terminates, next_v_pred=next_v_pred
+     n_steps_rets = calc_nstep_return(
+         rewards=rewards,
+         dones=terminates,
+         next_v_pred=next_v_pred,
+         gamma=self.gamma,
+         n_steps=self.n_steps
      )
      advs = n_steps_rets - v_preds
      v_targets = n_steps_rets
      return standardize(advs), v_targets

-   def calc_nstep_return(self, rewards, dones, next_v_pred):
-     T = len(rewards)  #pylint: disable=invalid-name
-     rets = torch.zeros_like(rewards).to(device)
-     _ = 1 - dones
-
-     for i in range(T):
-       rets[i] = torch.unsqueeze(
-           self.gamma ** torch.arange(len(rewards[i:min(self.n_steps + i, T)])
-                                     ).to(device),
-           dim=0
-       ) @ rewards[i:min(self.n_steps + i, T)]
-
-     if T > self.n_steps:
-       value_n_steps = self.gamma ** self.n_steps * next_v_pred[self.n_steps:]
-       rets = torch.cat([
-           value_n_steps,
-           torch.zeros(size=(self.n_steps, 1)).to(device)
-       ]) + rets
-
-     return rets
-
    def calc_gae_advs_v_target(self, states, rewards, next_states, terminates):
      """calculate the GAE (Generalized Advantage Estimation) and V_target.
@@ -415,21 +392,12 @@ def calc_gae_advs_v_target(self, states, rewards, next_states, terminates):
      next_v_pred = self.critic.forward(next_states[-1])
      v_preds = self.critic.forward(states).detach()
      v_preds_all = torch.concat((v_preds, next_v_pred.unsqueeze(0)), dim=0)
-     advs = self.calc_gaes(rewards, terminates, v_preds_all)
+     advs = calc_gaes(
+         rewards, terminates, v_preds_all, self.gamma, self.gae_lambda
+     )
      v_target = advs + v_preds
      return standardize(advs), v_target

-   def calc_gaes(self, rewards, dones, v_preds):
-     T = len(rewards)  # pylint: disable=invalid-name
-     gaes = torch.zeros_like(rewards, device=device)
-     future_gae = torch.tensor(0.0, dtype=rewards.dtype, device=device)
-     not_dones = 1 - dones  # to reset at episode boundary by multiplying 0
-     deltas = rewards + self.gamma * v_preds[1:] * not_dones - v_preds[:-1]
-     coef = self.gamma * self.gae_lambda
-     for t in reversed(range(T)):
-       gaes[t] = future_gae = deltas[t] + coef * not_dones[t] * future_gae
-     return gaes
-
    def action(self, state, mode="eval"):
      if mode == "train":
        self.actor.train()
