adding benchmark target; updating ui; bumping rlt

jonas-eschmann · jonas-eschmann · commit 04baa5731d1a · 2024-10-01T15:08:59.000-04:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,12 +3,14 @@
 #set(RL_TOOLS_BACKEND_ENABLE_ACCELERATE ON) # if you are on macOS (fastest on Apple Silicon)
 add_subdirectory(external/rl_tools)
 
-add_executable(my_pendulum
-    src/main.cpp
-)
-#target_compile_definitions(my_pendulum PRIVATE BENCHMARK)
+add_executable(my_pendulum src/main.cpp)
 target_link_libraries(my_pendulum PRIVATE RLtools::RLtools)
 
+# The following target disables evaluations and checkpointing during training to assess the training time
+add_executable(my_pendulum_benchmark src/main.cpp)
+target_compile_definitions(my_pendulum_benchmark PRIVATE BENCHMARK)
+target_link_libraries(my_pendulum_benchmark PRIVATE RLtools::RLtools)
+
 
 
 if(NOT MSVC AND CMAKE_BUILD_TYPE STREQUAL "Release")
diff --git a/external/rl_tools b/external/rl_tools
@@ -1 +1 @@
-Subproject commit 61208a2dedb41fb371b6e4609e596835ced8e78a
+Subproject commit 0ded36c23d59c7947737087ac8b1915546d05653
diff --git a/include/my_pendulum/operations_cpu.h b/include/my_pendulum/operations_cpu.h
@@ -1,22 +1,32 @@
+#include <string>
+
 namespace rl_tools{
-    template<typename DEVICE, typename SPEC>
-    std::string json(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters){
+    template <typename DEVICE, typename SPEC>
+    std::string json(DEVICE&, MyPendulum<SPEC>& env, typename MyPendulum<SPEC>::Parameters& parameters){
         return "{}";
     }
-
-    template<typename DEVICE, typename SPEC>
-    std::string json(DEVICE& device, const MyPendulum<SPEC>& env, const typename MyPendulum<SPEC>::Parameters& parameters, const typename MyPendulum<SPEC>::State& state){
+    template <typename DEVICE, typename SPEC>
+    std::string json(DEVICE&, MyPendulum<SPEC>& env, typename MyPendulum<SPEC>::Parameters& parameters, typename MyPendulum<SPEC>::State& state){
         std::string json = "{";
         json += "\"theta\":" + std::to_string(state.theta) + ",";
         json += "\"theta_dot\":" + std::to_string(state.theta_dot);
         json += "}";
         return json;
     }
+
     template <typename DEVICE, typename SPEC>
     std::string get_ui(DEVICE& device, MyPendulum<SPEC>& env){
-        // just the body of `function render(ctx, state, action) {` (so that it can be easily processed by `new Function("ctx", "state", "action", body)`
+        // Implement and export the functions `export async function init(canvas, options)` and `export async function render(ui_state, parameters, state, action)` so that they are available as ES6 imports
         // Please have a look at https://studio.rl.tools which helps you create render functions interactively
         std::string ui = R"RL_TOOLS_LITERAL(
+export async function init(canvas, options){
+    // Simply saving the context for 2D environments
+    return {
+        ctx: canvas.getContext('2d')
+    }
+}
+export async function render(ui_state, parameters, state, action) {
+    const ctx = ui_state.ctx
     ctx.clearRect(0, 0, ctx.canvas.width, ctx.canvas.height);
 
     const centerX = ctx.canvas.width / 2;
@@ -88,8 +98,8 @@ namespace rl_tools{
     ctx.lineTo(arrowX, arrowY);
     ctx.fillStyle = 'black';
     ctx.fill();
+}
         )RL_TOOLS_LITERAL";
         return ui;
     }
-
-}
+}
diff --git a/src/main.cpp b/src/main.cpp
@@ -22,27 +22,32 @@ using TI = typename DEVICE::index_t;
 using PENDULUM_SPEC = MyPendulumSpecification<T, TI, MyPendulumParameters<T>>;
 using ENVIRONMENT = MyPendulum<PENDULUM_SPEC>;
 struct LOOP_CORE_PARAMETERS: rlt::rl::algorithms::ppo::loop::core::DefaultParameters<T, TI, ENVIRONMENT>{
-    struct PPO_PARAMETERS: rlt::rl::algorithms::ppo::DefaultParameters<T, TI>{
-        static constexpr T ACTION_ENTROPY_COEFFICIENT = 0.0;
-        static constexpr TI N_EPOCHS = 2;
-    };
-
+    static constexpr TI BATCH_SIZE = 256;
+    static constexpr TI ACTOR_HIDDEN_DIM = 64;
+    static constexpr TI CRITIC_HIDDEN_DIM = 64;
+    static constexpr TI ON_POLICY_RUNNER_STEPS_PER_ENV = 1024;
     static constexpr TI N_ENVIRONMENTS = 4;
-    static constexpr TI ON_POLICY_RUNNER_STEPS_PER_ENV = 256;
-    static constexpr TI BATCH_SIZE = 64;
     static constexpr TI TOTAL_STEP_LIMIT = 300000;
     static constexpr TI STEP_LIMIT = TOTAL_STEP_LIMIT/(ON_POLICY_RUNNER_STEPS_PER_ENV * N_ENVIRONMENTS) + 1;
     static constexpr TI EPISODE_STEP_LIMIT = 200;
+    using OPTIMIZER_PARAMETERS = rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_PYTORCH<T>;
+    struct PPO_PARAMETERS: rlt::rl::algorithms::ppo::DefaultParameters<T, TI, BATCH_SIZE>{
+        static constexpr T ACTION_ENTROPY_COEFFICIENT = 0.0;
+        static constexpr TI N_EPOCHS = 2;
+        static constexpr T GAMMA = 0.9;
+        static constexpr T INITIAL_ACTION_STD = 2.0;
+        static constexpr bool NORMALIZE_OBSERVATIONS = true;
+    };
 };
 using LOOP_CORE_CONFIG = rlt::rl::algorithms::ppo::loop::core::Config<T, TI, RNG, ENVIRONMENT, LOOP_CORE_PARAMETERS>;
+#ifndef BENCHMARK
+using LOOP_EXTRACK_CONFIG = rlt::rl::loop::steps::extrack::Config<LOOP_CORE_CONFIG>; // Sets up the experiment tracking structure (https://docs.rl.tools/10-Experiment%20Tracking.html)
 template <typename NEXT>
 struct LOOP_EVAL_PARAMETERS: rlt::rl::loop::steps::evaluation::Parameters<T, TI, NEXT>{
     static constexpr TI EVALUATION_INTERVAL = 4;
     static constexpr TI NUM_EVALUATION_EPISODES = 10;
     static constexpr TI N_EVALUATIONS = NEXT::CORE_PARAMETERS::STEP_LIMIT / EVALUATION_INTERVAL;
 };
-#ifndef BENCHMARK
-using LOOP_EXTRACK_CONFIG = rlt::rl::loop::steps::extrack::Config<LOOP_CORE_CONFIG>; // Sets up the experiment tracking structure (https://docs.rl.tools/10-Experiment%20Tracking.html)
 using LOOP_EVALUATION_CONFIG = rlt::rl::loop::steps::evaluation::Config<LOOP_EXTRACK_CONFIG, LOOP_EVAL_PARAMETERS<LOOP_EXTRACK_CONFIG>>; // Evaluates the policy in a fixed interval and logs the return
 struct LOOP_SAVE_TRAJECTORIES_PARAMETERS: rlt::rl::loop::steps::save_trajectories::Parameters<T, TI, LOOP_EVALUATION_CONFIG>{
     static constexpr TI INTERVAL_TEMP = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / 10;
@@ -84,5 +89,5 @@ int main(){
     }
     auto end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> diff = end_time-start_time;
-    std::cout << "Training time: " << diff.count() << std::endl;
+    std::cout << "Training time: " << diff.count() << " s" << std::endl;
 }