Bump rl_tools submodule & use good training params

jonas-eschmann · jonas-eschmann · commit dc81159bfc40 · 2024-11-18T20:38:07.000-05:00
diff --git a/external/rl_tools b/external/rl_tools
@@ -1 +1 @@
-Subproject commit f64aca16ba287d557c4eda2bb22b6751e3b844aa
+Subproject commit 41909d59d4648bf8f58de4145dcc4619f69a5c9b
diff --git a/src/main.cpp b/src/main.cpp
@@ -22,25 +22,23 @@ using TI = typename DEVICE::index_t;
 using PENDULUM_SPEC = MyPendulumSpecification<T, TI, MyPendulumParameters<T>>;
 using ENVIRONMENT = MyPendulum<PENDULUM_SPEC>;
 struct LOOP_CORE_PARAMETERS: rlt::rl::algorithms::ppo::loop::core::DefaultParameters<T, TI, ENVIRONMENT>{
-
     static constexpr TI N_ENVIRONMENTS = 8;
     static constexpr TI ON_POLICY_RUNNER_STEPS_PER_ENV = 128;
     static constexpr TI BATCH_SIZE = 128;
-    static constexpr TI TOTAL_STEP_LIMIT = 500000;
+    static constexpr TI TOTAL_STEP_LIMIT = 1000000;
     static constexpr TI ACTOR_HIDDEN_DIM = 32;
     static constexpr TI CRITIC_HIDDEN_DIM = 32;
     static constexpr auto ACTOR_ACTIVATION_FUNCTION = rlt::nn::activation_functions::ActivationFunction::FAST_TANH;
     static constexpr auto CRITIC_ACTIVATION_FUNCTION = rlt::nn::activation_functions::ActivationFunction::FAST_TANH;
     static constexpr TI STEP_LIMIT = TOTAL_STEP_LIMIT/(ON_POLICY_RUNNER_STEPS_PER_ENV * N_ENVIRONMENTS) + 1;
     static constexpr TI EPISODE_STEP_LIMIT = ENVIRONMENT::EPISODE_STEP_LIMIT;
     struct OPTIMIZER_PARAMETERS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
-        static constexpr T ALPHA = 0.01;
+        static constexpr T ALPHA = 0.001;
     };
-
+    static constexpr bool NORMALIZE_OBSERVATIONS = true;
     struct PPO_PARAMETERS: rlt::rl::algorithms::ppo::DefaultParameters<T, TI, BATCH_SIZE>{
         static constexpr T ACTION_ENTROPY_COEFFICIENT = 0.0;
         static constexpr TI N_EPOCHS = 1;
-        static constexpr bool NORMALIZE_OBSERVATIONS = true;
         static constexpr T GAMMA = 0.9;
         static constexpr T INITIAL_ACTION_STD = 2.0;
     };
@@ -50,7 +48,7 @@ using LOOP_CORE_CONFIG = rlt::rl::algorithms::ppo::loop::core::Config<T, TI, RNG
 using LOOP_EXTRACK_CONFIG = rlt::rl::loop::steps::extrack::Config<LOOP_CORE_CONFIG>; // Sets up the experiment tracking structure (https://docs.rl.tools/10-Experiment%20Tracking.html)
 template <typename NEXT>
 struct LOOP_EVAL_PARAMETERS: rlt::rl::loop::steps::evaluation::Parameters<T, TI, NEXT>{
-    static constexpr TI EVALUATION_INTERVAL = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / 5;
+    static constexpr TI EVALUATION_INTERVAL = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / 10;
     static constexpr TI NUM_EVALUATION_EPISODES = 10;
     static constexpr TI N_EVALUATIONS = NEXT::CORE_PARAMETERS::STEP_LIMIT / EVALUATION_INTERVAL;
 };
@@ -75,7 +73,7 @@ using LOOP_STATE = typename LOOP_CONFIG::template State<LOOP_CONFIG>;
 
 int main(){
     DEVICE device;
-    TI seed = 2;
+    TI seed = 0;
     LOOP_STATE ls;
 #ifndef BENCHMARK
     // Set experiment tracking info