Commit 952a63a
Update C51 algorithm hyperparameters and switch environment to LunarLander (#28)
* Update the C51 hyperparameters: change the initial epsilon value and the decay strategy, and switch the environment to LunarLander * Fix C51 * Adjust param
1 parent 982bc91 commit 952a63a

File tree

5 files changed: +36 −64 lines changed


AWR/main.py

Lines changed: 5 additions & 4 deletions
@@ -52,8 +52,8 @@ def main(
   env = TrainMonitor(env, tensorboard_dir="./logs", tensorboard_write_all=True)

   gamma = 0.995
-  lr_actor = 0.001
-  lr_critic = 0.001
+  lr_actor = 5e-5
+  lr_critic = 1e-4
   batch_size = 64
   beta = 0.01

@@ -64,10 +64,11 @@ def main(
   learn_iteration = 10
   num_workers = 32

-  grad_clip = 0.5
+  grad_clip = 100
   norm_factor = 10
   value_network_scale = True
   l2_loss_weight = 0.01
+  memory_size = int(1e8)

   agent = Agent(
       state_dims=env.observation_space.shape[0],
@@ -77,7 +78,7 @@ def main(
       gamma=gamma,
       batch_size=batch_size,
       forget_experience=False,
-      mem_size=100000,
+      mem_size=memory_size,
       beta=beta,
       td_lambda=td_lambda,
       awr_beta=awr_beta,
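For context on the grad_clip change from 0.5 to 100: in PyTorch training loops a value like this is usually the max_norm argument of gradient-norm clipping. The sketch below shows that common pattern; whether this repo's Agent clips by global norm (rather than by value) is an assumption, and `network` and `optimizer` are placeholders, not names from this codebase.

import torch
import torch.nn as nn

network = nn.Linear(8, 2)  # stand-in for the actor/critic network
optimizer = torch.optim.Adam(network.parameters(), lr=5e-5)

loss = network(torch.randn(64, 8)).pow(2).mean()  # dummy loss
optimizer.zero_grad()
loss.backward()
# rescale gradients so their global norm is at most grad_clip, then step
torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=100)
optimizer.step()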

C51/C51Agent_100.gif

325 KB

C51/C51Agent_100_CarPole.gif

385 KB

C51/c51.py

Lines changed: 21 additions & 53 deletions
@@ -15,11 +15,11 @@ def __init__(
       self,
       state_dim,
       action_space,
-      n_atoms,
+      n_atoms=51,
       seed=0,
-      hidden_size=None,
-      init_weight_gain=np.sqrt(2),
-      init_bias=0
+      fc1_unit=128,
+      fc2_unit=128,
+      fc3_unit=128,
   ):
     """
     Initialize parameters and build model.
@@ -31,58 +31,26 @@ def __init__(
       fc1_unit (int): Number of nodes in first hidden layer
       fc2_unit (int): Number of nodes in second hidden layer
     """
-    super().__init__()
+    super().__init__()  ## calls __init__ method of nn.Module class
+    self.seed = torch.manual_seed(seed)
     self.action_space = action_space
     self.n_atoms = n_atoms
-    self.seed = torch.manual_seed(seed)
-    self.hidden_size = (100, 100, 100) if not hidden_size else hidden_size
-    self.bn = nn.BatchNorm1d(state_dim)
-
-    def init_weights(m):
-      if isinstance(m, nn.Linear):
-        nn.init.orthogonal_(m.weight, gain=init_weight_gain)
-        nn.init.constant_(m.bias, init_bias)
-
-    # note: The self.hidden_layers attribute is defined as a list of lists,
-    # note: but it should be a list of `nn.Sequential` objects.
-    # note: You can fix this by using `nn.Sequential` to define each layer.
-    # note: After using `nn.Sequential`, you need to define a list with
-    # note: `nn.ModuleList` to construct the model graph.
-    self.hidden_layers = nn.ModuleList([
-        nn.Sequential(nn.Linear(in_size, out_size), nn.LeakyReLU())
-        for in_size, out_size in zip((state_dim, ) +
-                                     self.hidden_size, self.hidden_size)
-    ])
-    self.hidden_layers.apply(init_weights)
-
-    def init_output_weights(m):
-      if isinstance(m, nn.Linear):
+    self.fc1 = nn.Linear(state_dim, fc1_unit)
+    self.fc2 = nn.Linear(fc1_unit, fc2_unit)
+    self.fc3 = nn.Linear(fc2_unit, fc3_unit)
+    self.fc4 = nn.Linear(fc3_unit, action_space * n_atoms)

-        nn.init.orthogonal_(m.weight, gain=init_weight_gain)
-        nn.init.constant_(m.bias, init_bias)
-
-    self.output_layers = nn.ModuleList([
-        nn.Sequential(
-            nn.Linear(self.hidden_size[-1], n_atoms), nn.LeakyReLU(),
-            nn.Softmax(dim=-1)
-        ) for _ in range(action_space)
-    ])
-
-    self.output_layers.apply(init_output_weights)
-
-  def forward(self, state):
-    x = self.bn(state)
-    for hidden_layer in self.hidden_layers:
-      x = hidden_layer(x)
-    out = torch.concat([
-        torch.unsqueeze(output_layer(x), dim=1)
-        for output_layer in self.output_layers
-    ],
-                       dim=1)
-    # x = self.output_layer(x)
-    # x = torch.reshape(x, (-1, self.action_space, self.n_atoms))
-    # x = F.softmax(x, dim=-1)
-    return out
+  def forward(self, x):
+    """
+    Build a network that maps state -> action values.
+    """
+    x = F.leaky_relu(self.fc1(x))
+    x = F.leaky_relu(self.fc2(x))
+    x = F.leaky_relu(self.fc3(x))
+    x = self.fc4(x)
+    x = torch.reshape(x, (-1, self.action_space, self.n_atoms))
+    x = F.softmax(x, dim=-1)
+    return x


 # device = torch.device("cpu")
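The rewritten network returns a softmax over n_atoms support points per action rather than scalar Q-values. A minimal standalone sketch of how such a distributional output is typically reduced to Q-values for action selection in C51; the support bounds and the random `probs` tensor here are illustrative, not taken from this repo:

import torch

n_atoms, v_min, v_max = 51, -100, 100
support = torch.linspace(v_min, v_max, n_atoms)  # atom values z_i

# probs: output of QNetwork.forward, shape (batch, action_space, n_atoms)
probs = torch.softmax(torch.randn(1, 4, n_atoms), dim=-1)
q_values = (probs * support).sum(dim=-1)  # Q(s, a) = sum_i z_i * p_i(s, a)
greedy_action = q_values.argmax(dim=-1)   # shape (batch,)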

C51/main.py

Lines changed: 10 additions & 7 deletions
@@ -1,5 +1,6 @@
 """main executable file for Distribution Q learning."""
 import os
+import math
 import logging
 from itertools import repeat
 import gymnasium as gym
@@ -24,9 +25,9 @@
 def main(
     n_episodes=2000,
     max_t=500,
-    eps_start=1,
+    eps_start=0.9,
     eps_end=0.01,
-    eps_decay=0.996,
+    eps_decay=1000,
     score_term_rules=lambda s: False,
     time_interval="25ms"
 ):
@@ -45,17 +46,17 @@ def main(
   scores = []  # list containing score from each episode
   scores_window = deque(maxlen=100)  # last 100 scores
   eps = eps_start
-  env = gym.make("CartPole-v1", render_mode="rgb_array")
+  env = gym.make("LunarLander-v3", render_mode="rgb_array")
   env = TrainMonitor(env, tensorboard_dir="./logs", tensorboard_write_all=True)

   gamma = 0.99
-  lr = 0.0001
+  lr = 1e-4
   batch_size = 64
   learn_iteration = 16
   update_q_target_freq = 4
   n_atoms = 51
-  v_min = -20
-  v_max = 20
+  v_min = -100
+  v_max = 100

   agent = Agent(
       state_dims=env.observation_space.shape[0],
@@ -89,7 +90,9 @@ def main(

     scores_window.append(score)  ## save the most recent score
     scores.append(score)  ## save the most recent score
-    eps = max(eps * eps_decay, eps_end)  ## decrease the epsilon
+    # eps = max(eps * eps_decay, eps_end)  ## decrease the epsilon
+    eps = eps_end + (eps_start - eps_end) * \
+        math.exp(-1. * t * i_episode * learn_iteration / eps_decay)
     print(" " * os.get_terminal_size().columns, end="\r")
     print(
         f"\rEpisode {i_episode}\tAverage Score {np.mean(scores_window):.2f}",
