Skip to content

Commit 85cd1f0

Browse files
update
1 parent 8e047d3 commit 85cd1f0

File tree

4 files changed

+26
-20
lines changed

4 files changed

+26
-20
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
project(rl-tools-example)
12
#set(RL_TOOLS_BACKEND_ENABLE_MKL ON) # if you have MKL installed (fastest on Intel)
23
#set(RL_TOOLS_BACKEND_ENABLE_OPENBLAS ON) # if you have OpenBLAS installed
34
#set(RL_TOOLS_BACKEND_ENABLE_ACCELERATE ON) # if you are on macOS (fastest on Apple Silicon)

README.MD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,4 @@ This example also includes the automatic experiment tracking available through t
4646
2. [State => JSON](https://github.com/rl-tools/example/blob/39acaa5b5402eacf5c2cab7b2e96db71f2ea110f/include/my_pendulum/operations_cpu.h#L8): Self-explanatory
4747
3. [UI Render function string](https://github.com/rl-tools/example/blob/39acaa5b5402eacf5c2cab7b2e96db71f2ea110f/include/my_pendulum/operations_cpu.h#L16): This function uses the HTML5 Canvas rendering API and can be easily created using [https://studio.rl.tools](https://studio.rl.tools). Note that due to the widespread use of the HTML5 Canvas drawing API, ChatGPT is also really good at creating render functions for different environments if you give it an example like the ones provided on [https://studio.rl.tools](https://studio.rl.tools).
4848

49-
The experiment tracking and save-trajectories step will periodically record trajectories and store them as `.json` files. After/while running the training you can run `./serve.sh` which should start a local webserver on [http://localhost:8080](http://localhost:8080) where you can see the recorded trajectories based on the render function you provided.
49+
The experiment tracking and save-trajectories step will periodically record trajectories and store them as `.json` files. After/while running the training you can run `python3 -m http.server 8080` which should start a local webserver on [http://localhost:8080](http://localhost:8080) where you can see the recorded trajectories based on the render function you provided.

include/my_pendulum/my_pendulum.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,5 @@ struct MyPendulum: rl_tools::rl::environments::Environment<typename T_SPEC::T, t
4343
using ObservationPrivileged = Observation;
4444
static constexpr TI OBSERVATION_DIM = 3;
4545
static constexpr TI ACTION_DIM = 1;
46+
static constexpr TI EPISODE_STEP_LIMIT = 200;
4647
};

src/main.cpp

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,35 +22,41 @@ using TI = typename DEVICE::index_t;
2222
using PENDULUM_SPEC = MyPendulumSpecification<T, TI, MyPendulumParameters<T>>;
2323
using ENVIRONMENT = MyPendulum<PENDULUM_SPEC>;
2424
struct LOOP_CORE_PARAMETERS: rlt::rl::algorithms::ppo::loop::core::DefaultParameters<T, TI, ENVIRONMENT>{
25-
static constexpr TI BATCH_SIZE = 256;
26-
static constexpr TI ACTOR_HIDDEN_DIM = 64;
27-
static constexpr TI CRITIC_HIDDEN_DIM = 64;
28-
static constexpr TI ON_POLICY_RUNNER_STEPS_PER_ENV = 1024;
29-
static constexpr TI N_ENVIRONMENTS = 4;
30-
static constexpr TI TOTAL_STEP_LIMIT = 300000;
25+
26+
static constexpr TI N_ENVIRONMENTS = 8;
27+
static constexpr TI ON_POLICY_RUNNER_STEPS_PER_ENV = 128;
28+
static constexpr TI BATCH_SIZE = 128;
29+
static constexpr TI TOTAL_STEP_LIMIT = 500000;
30+
static constexpr TI ACTOR_HIDDEN_DIM = 32;
31+
static constexpr TI CRITIC_HIDDEN_DIM = 32;
32+
static constexpr auto ACTOR_ACTIVATION_FUNCTION = rlt::nn::activation_functions::ActivationFunction::FAST_TANH;
33+
static constexpr auto CRITIC_ACTIVATION_FUNCTION = rlt::nn::activation_functions::ActivationFunction::FAST_TANH;
3134
static constexpr TI STEP_LIMIT = TOTAL_STEP_LIMIT/(ON_POLICY_RUNNER_STEPS_PER_ENV * N_ENVIRONMENTS) + 1;
32-
static constexpr TI EPISODE_STEP_LIMIT = 200;
33-
using OPTIMIZER_PARAMETERS = rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_PYTORCH<T>;
35+
static constexpr TI EPISODE_STEP_LIMIT = ENVIRONMENT::EPISODE_STEP_LIMIT;
36+
struct OPTIMIZER_PARAMETERS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
37+
static constexpr T ALPHA = 0.01;
38+
};
39+
3440
struct PPO_PARAMETERS: rlt::rl::algorithms::ppo::DefaultParameters<T, TI, BATCH_SIZE>{
3541
static constexpr T ACTION_ENTROPY_COEFFICIENT = 0.0;
36-
static constexpr TI N_EPOCHS = 2;
42+
static constexpr TI N_EPOCHS = 1;
43+
static constexpr bool NORMALIZE_OBSERVATIONS = true;
3744
static constexpr T GAMMA = 0.9;
3845
static constexpr T INITIAL_ACTION_STD = 2.0;
39-
static constexpr bool NORMALIZE_OBSERVATIONS = true;
4046
};
4147
};
4248
using LOOP_CORE_CONFIG = rlt::rl::algorithms::ppo::loop::core::Config<T, TI, RNG, ENVIRONMENT, LOOP_CORE_PARAMETERS>;
4349
#ifndef BENCHMARK
4450
using LOOP_EXTRACK_CONFIG = rlt::rl::loop::steps::extrack::Config<LOOP_CORE_CONFIG>; // Sets up the experiment tracking structure (https://docs.rl.tools/10-Experiment%20Tracking.html)
4551
template <typename NEXT>
4652
struct LOOP_EVAL_PARAMETERS: rlt::rl::loop::steps::evaluation::Parameters<T, TI, NEXT>{
47-
static constexpr TI EVALUATION_INTERVAL = 4;
53+
static constexpr TI EVALUATION_INTERVAL = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / 5;
4854
static constexpr TI NUM_EVALUATION_EPISODES = 10;
4955
static constexpr TI N_EVALUATIONS = NEXT::CORE_PARAMETERS::STEP_LIMIT / EVALUATION_INTERVAL;
5056
};
5157
using LOOP_EVALUATION_CONFIG = rlt::rl::loop::steps::evaluation::Config<LOOP_EXTRACK_CONFIG, LOOP_EVAL_PARAMETERS<LOOP_EXTRACK_CONFIG>>; // Evaluates the policy in a fixed interval and logs the return
5258
struct LOOP_SAVE_TRAJECTORIES_PARAMETERS: rlt::rl::loop::steps::save_trajectories::Parameters<T, TI, LOOP_EVALUATION_CONFIG>{
53-
static constexpr TI INTERVAL_TEMP = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / 10;
59+
static constexpr TI INTERVAL_TEMP = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / 3;
5460
static constexpr TI INTERVAL = INTERVAL_TEMP == 0 ? 1 : INTERVAL_TEMP;
5561
static constexpr TI NUM_EPISODES = 10;
5662
};
@@ -69,23 +75,21 @@ using LOOP_STATE = typename LOOP_CONFIG::template State<LOOP_CONFIG>;
6975

7076
int main(){
7177
DEVICE device;
72-
TI seed = 1337;
78+
TI seed = 2;
7379
LOOP_STATE ls;
7480
#ifndef BENCHMARK
7581
// Set experiment tracking info
7682
ls.extrack_name = "example";
7783
#endif
7884
rlt::malloc(device, ls);
7985
rlt::init(device, ls, seed);
80-
ls.actor_optimizer.parameters.alpha = 1e-2;
81-
ls.critic_optimizer.parameters.alpha = 1e-2;
8286
auto start_time = std::chrono::high_resolution_clock::now();
8387
while(!rlt::step(device, ls)){
8488
// do what ever you want here, e.g. poor man's learning rate scheduler:
85-
if(ls.step % 1 == 0){
86-
ls.actor_optimizer.parameters.alpha *= 0.9;
87-
ls.critic_optimizer.parameters.alpha *= 0.9;
88-
}
89+
// if(ls.step % 1 == 0){
90+
// ls.actor_optimizer.parameters.alpha *= 0.9;
91+
// ls.critic_optimizer.parameters.alpha *= 0.9;
92+
// }
8993
}
9094
auto end_time = std::chrono::high_resolution_clock::now();
9195
std::chrono::duration<double> diff = end_time-start_time;

0 commit comments

Comments
 (0)