Created action A25 & A26 - adjustments with failures allowed
RodrigoCastroF committed Sep 7, 2024
1 parent 6a3aa4b commit aa49137
Showing 6 changed files with 66 additions and 12 deletions.
9 changes: 9 additions & 0 deletions flowing_basin/rl_data/configs/action/A25.json
@@ -0,0 +1,9 @@
{
"action_type": "adjustments",
"num_actions_block": 99,
"max_iterations": 10,
"terminate_on_failure": false,
"bonus_exceed_initial": 100.0,
"penalty_not_exceed_initial": 100.0,
"reward_relative_pct": true
}
8 changes: 8 additions & 0 deletions flowing_basin/rl_data/configs/action/A26.json
@@ -0,0 +1,8 @@
{
"action_type": "adjustments",
"num_actions_block": 99,
"max_iterations": 10,
"terminate_on_failure": false,
"bonus_exceed_initial": 100.0,
"penalty_not_exceed_initial": 100.0
}
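The two new configs differ only in the `reward_relative_pct` flag. A minimal sketch of that difference (not part of the commit; the relative paths and the use of the plain `json` module rather than the repository's own config loader are assumptions):

```python
import json

# Load the two new action configs and diff them.
# Paths assume the repository root as the working directory.
with open("flowing_basin/rl_data/configs/action/A25.json") as f:
    a25 = json.load(f)
with open("flowing_basin/rl_data/configs/action/A26.json") as f:
    a26 = json.load(f)

# Keys where A25 differs from A26: only the reward_relative_pct flag,
# i.e. A25 rewards the relative (%) improvement, A26 the absolute one.
diff = {key: value for key, value in a25.items() if a26.get(key) != value}
print(diff)  # {'reward_relative_pct': True}
```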
14 changes: 14 additions & 0 deletions flowing_basin/rl_data/configs/action/README.md
@@ -1,14 +1,28 @@
Single-step continuous actions:
- `A0`: the agent gives relative variations of flow.
- `A1`: the agent gives the exiting flows directly.

Multi-step continuous actions:
- `A110`: the agent gives the exiting flows for the next 9 (99 // 11) timesteps.
- `A111`: the agent gives the exiting flows for the next 20 (~ 99 // 5) timesteps.
- `A112`: the agent gives the exiting flows for the next 33 (99 // 3) timesteps.
- `A113`: the agent gives the exiting flows for the next 99 (99 // 1) timesteps.

Adjustments actions:
- `A21`: the agent adjusts the actions of the solution (initially rl-greedy's) until the solution no longer improves.
- `A22`: like A21, but the initial greedy actions are taken with a greediness of 80% instead of 100%
(which has similar performance and avoids clipping all positive adjustments).
- `A23`: like A22, but Gaussian noise with 0.15 standard deviation is added to the initial greedy actions.
- `A24`: like A21, but the initial actions are completely random (no greediness).
- `A25`: like A21, with these differences:
- The agent is allowed to worsen the solution, and the episode always ends after 10 adjustments.
- The agent is given a bonus of +100 if, by the end of the episode, its solution has exceeded the initial rl-greedy solution.
- The agent is given a penalty of -100 if the episode ends without ever exceeding the initial rl-greedy solution.
- The reward is the _relative_ difference (in %) between the current and previous solutions
(e.g., the reward is +20 when the solution is improved by 20%).
- `A26`: like A25, but using the absolute difference instead of the relative difference.
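The A25/A26 reward scheme described above can be condensed into a small, self-contained sketch (function and argument names are illustrative, not the environment's actual API; the `> 1` guard mirrors the environment's fallback to the absolute difference when the totals are too small for a meaningful percentage):

```python
def adjustment_reward(total_rewards: list[float], relative_pct: bool, done: bool,
                      bonus: float = 100.0, penalty: float = 100.0) -> float:
    """Illustrative A25/A26-style reward for the latest adjustment.

    total_rewards[0] is the initial (rl-greedy) solution's total reward and
    total_rewards[-1] the one obtained after the latest adjustment.
    """
    current, previous = total_rewards[-1], total_rewards[-2]
    if relative_pct and current > 1 and previous > 1:
        reward = (current - previous) / previous * 100  # relative difference in %
    else:
        reward = current - previous  # absolute difference
    if done:  # bonus/penalty only at the end of the episode
        if max(total_rewards) > total_rewards[0]:
            reward += bonus  # the best solution found exceeded the initial one
        else:
            reward -= penalty
    return reward


# The agent improves the previous solution (190) by 20% on its final adjustment,
# and the best solution (228) exceeds the initial one (200), so the bonus applies:
print(adjustment_reward([200.0, 190.0, 228.0], relative_pct=True, done=True))  # 120.0
```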

Single-step discrete actions:
- `A31`: the agent can only choose among the given optimal flows.
- `A32`: the agent chooses the flow from the given number of discretization levels for each dam.
- `A33`: the agent chooses the turbine count and the flow level within this number of turbines.
6 changes: 3 additions & 3 deletions flowing_basin/scripts/rl_study_env.py
@@ -16,10 +16,10 @@

if __name__ == "__main__":

rl = ReinforcementLearning("rl-A1G0O1R1T12-3", verbose=3)
rl = ReinforcementLearning("rl-A25G0O3R1T3", verbose=3)
# rl.create_train_env()
# print("Configuration:", rl.config.to_dict())
# rl.check_train_env(obs_types=['raw'], initial_date='2020-08-19 00:00', seed=42) # instancePercentile50
rl.check_train_env(obs_types=['raw'], initial_date='2020-08-19 00:00', seed=42) # instancePercentile50
# rl.collect_obs()
# rl.train(save_agent=SAVE_AGENT)

@@ -28,7 +28,7 @@
# rl.plot_training_curve_agent(instances=["Percentile50"])
# rl.plot_training_curves_compare(["rl-A1G0O22R1T02", "rl-A1G0O221R1T02"], ["MILP"], values=["income", "acc_reward"])

print(rl.run_agent("Percentile50").solution.to_dict())
# print(rl.run_agent("Percentile50").solution.to_dict())
# start = time.perf_counter()
# print(rl.run_agent([f"Percentile{i*10:02}" for i in range(0, 11)]))
# print("Time:", time.perf_counter() - start)
8 changes: 7 additions & 1 deletion flowing_basin/solvers/rl/config.py
@@ -244,10 +244,16 @@ class ActionConfiguration(BaseProcessableConfiguration):  # noqa
optimal_flow_values: dict[str, list[float]] = field(default_factory=lambda: dict())
discretization_levels: int = None
num_actions_block: int = 1 # By default, the agent only gives the actions for the current timestep
greediness: float = 1. # Greediness of the baseline greedy agent when action_type == 'adjustments'

# Configurations when action_type == 'adjustments'
greediness: float = 1. # Greediness of the baseline greedy agent
noise_std_dev: float = 0.  # Standard deviation of the Gaussian noise added to the initial greedy actions
randomness: bool = False # If True, use random actions instead of greedy actions
max_iterations: int = 20 # Maximum number of consecutive adjustments
terminate_on_failure: bool = True  # Terminate the episode as soon as the total reward drops below the previous solution's
bonus_exceed_initial: float = 0. # Bonus for exceeding the initial solution (rl-greedy's)
penalty_not_exceed_initial: float = 0. # Penalty for NOT exceeding the initial solution when the episode ends
reward_relative_pct: bool = False # Use the relative improvement (%) of the solution instead of the absolute difference in total reward

def check(self):

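For reference, an A25-equivalent configuration built directly from these fields might look as follows. This is only a sketch: it assumes the dataclass accepts its fields as keyword arguments, that the module path mirrors the file path, and that no required field outside this excerpt is missing.

```python
from flowing_basin.solvers.rl.config import ActionConfiguration

config = ActionConfiguration(
    action_type="adjustments",
    num_actions_block=99,
    max_iterations=10,
    terminate_on_failure=False,        # keep adjusting even after a worse solution
    bonus_exceed_initial=100.0,        # +100 when the initial solution is exceeded
    penalty_not_exceed_initial=100.0,  # -100 when it never is
    reward_relative_pct=True,          # relative (%) improvement (A25); False for A26
)
config.check()  # validation hook defined by the class (body not shown in this excerpt)
```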
33 changes: 25 additions & 8 deletions flowing_basin/solvers/rl/env.py
@@ -802,12 +802,10 @@ def is_done(self, update_as_flows: bool = False) -> bool:
# Stop when instance finishes
done = self.simulator_is_done()
else:
# Stop when solution gets worse or the maximum number of iterations is exceeded
assert len(self.total_rewards) >= 2, "There is no baseline for the current total reward."
done = (
self.total_rewards[-1] < self.total_rewards[-2] or
len(self.total_rewards) - 1 > self.config.max_iterations
)
done = len(self.total_rewards) - 1 > self.config.max_iterations
if self.config.terminate_on_failure:
done = done or self.total_rewards[-1] < self.total_rewards[-2]

return bool(done)
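A standalone sketch of the termination rule above, assuming only that the list holds one total reward per solution built so far (names are illustrative; the environment keeps this list in `self.total_rewards`):

```python
def episode_done(total_rewards: list[float], max_iterations: int,
                 terminate_on_failure: bool) -> bool:
    """Illustrative version of the termination rule for 'adjustments' actions."""
    # total_rewards starts with the initial rl-greedy solution, so
    # len(total_rewards) - 1 is the number of adjustments made so far.
    done = len(total_rewards) - 1 > max_iterations
    if terminate_on_failure:
        # Previous behaviour (still available): stop as soon as the solution worsens
        done = done or total_rewards[-1] < total_rewards[-2]
    return done


# With terminate_on_failure=False (A25/A26), a worse solution no longer ends the episode:
print(episode_done([200.0, 195.0], max_iterations=10, terminate_on_failure=False))  # False
print(episode_done([200.0, 195.0], max_iterations=10, terminate_on_failure=True))   # True
```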

@@ -916,13 +914,32 @@ def step(
total_reward = sum(rewards)
else:
total_reward = None
if self.config.action_type != "adjustments" or update_as_flows or skip_rewards:

# Final reward; this is different from the total reward when action_type == 'adjustments'
not_using_adjustments = self.config.action_type != "adjustments" or update_as_flows or skip_rewards
if not_using_adjustments:
final_reward = total_reward
else:
# Difference with previous total reward
final_reward = total_reward - self.total_rewards[-1]
if self.config.reward_relative_pct:
# Relative difference (%) with previous total reward
if total_reward > 1 and self.total_rewards[-1] > 1:
final_reward = (total_reward - self.total_rewards[-1]) / self.total_rewards[-1] * 100
else:
# When the current or previous reward is negative or has a low value,
# the relative difference is meaningless, so we use the absolute difference instead
final_reward = total_reward - self.total_rewards[-1]
else:
# Absolute difference with previous total reward
final_reward = total_reward - self.total_rewards[-1]
self.total_rewards.append(total_reward)

# Bonuses and penalties to the final reward when action_type == 'adjustments'
if not not_using_adjustments and self.is_done():
if max(self.total_rewards) > self.total_rewards[0]:
final_reward += self.config.bonus_exceed_initial
else:
final_reward -= self.config.penalty_not_exceed_initial

# Unclipped flows equivalent to the given actions
flows_block = np.array(flows_block).reshape(-1) # Give it the same shape as the original action array
self.last_flows_block = flows_block
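To make the reward pipeline above concrete, a small worked trace (purely illustrative; it assumes `reward_relative_pct=True`, a bonus of 100, both totals above 1, and pretends the episode ends after the second adjustment):

```python
# Initial rl-greedy solution, then two adjustments
total_rewards = [200.0, 210.0, 205.0]

for i in range(1, len(total_rewards)):
    current, previous = total_rewards[i], total_rewards[i - 1]
    final_reward = (current - previous) / previous * 100  # relative difference in %
    if i == len(total_rewards) - 1:  # pretend the episode ends here
        # The bonus depends on the *best* solution found, not the last one
        if max(total_rewards[: i + 1]) > total_rewards[0]:
            final_reward += 100.0
        else:
            final_reward -= 100.0
    print(round(final_reward, 2))

# Prints 5.0, then 97.62: the last adjustment worsened the solution (-2.38 %),
# but the bonus still applies because 210 exceeded the initial 200.
```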
