Created action A25 & A26 - adjustments with failures allowed
RodrigoCastroF committed Sep 7, 2024
1 parent 6a3aa4b commit aa49137
Showing 6 changed files with 66 additions and 12 deletions.
9 changes: 9 additions & 0 deletions flowing_basin/rl_data/configs/action/A25.json
@@ -0,0 +1,9 @@
{
"action_type": "adjustments",
"num_actions_block": 99,
"max_iterations": 10,
"terminate_on_failure": false,
"bonus_exceed_initial": 100.0,
"penalty_not_exceed_initial": 100.0,
"reward_relative_pct": true
}
8 changes: 8 additions & 0 deletions flowing_basin/rl_data/configs/action/A26.json
@@ -0,0 +1,8 @@
{
"action_type": "adjustments",
"num_actions_block": 99,
"max_iterations": 10,
"terminate_on_failure": false,
"bonus_exceed_initial": 100.0,
"penalty_not_exceed_initial": 100.0
}
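The two new configs differ only in the `reward_relative_pct` flag. A minimal sketch of that difference (not part of the commit; the relative paths and the use of the plain `json` module rather than the repository's own config loader are assumptions):

```python
import json

# Load the two new action configs and diff them.
# Paths assume the repository root as the working directory.
with open("flowing_basin/rl_data/configs/action/A25.json") as f:
    a25 = json.load(f)
with open("flowing_basin/rl_data/configs/action/A26.json") as f:
    a26 = json.load(f)

# Keys where A25 differs from A26: only the reward_relative_pct flag,
# i.e. A25 rewards the relative (%) improvement, A26 the absolute one.
diff = {key: value for key, value in a25.items() if a26.get(key) != value}
print(diff)  # {'reward_relative_pct': True}
```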
14 changes: 14 additions & 0 deletions flowing_basin/rl_data/configs/action/README.md
@@ -1,14 +1,28 @@
Single-step continuous actions:
- `A0`: the agent gives relative variations of flow.
- `A1`: the agent gives the exiting flows directly.

Multi-step continuous actions:
- `A110`: the agent gives the exiting flows for the next 9 (99 // 11) timesteps.
- `A111`: the agent gives the exiting flows for the next 20 (~ 99 // 5) timesteps.
- `A112`: the agent gives the exiting flows for the next 33 (99 // 3) timesteps.
- `A113`: the agent gives the exiting flows for the next 99 (99 // 1) timesteps.

Adjustments actions:
- `A21`: the agent adjusts the actions of the solution (initially rl-greedy's) until the solution no longer improves.
- `A22`: like A21, but the initial greedy actions are taken with a greediness of 80% instead of 100%
(which has similar performance and avoids clipping all positive adjustments).
- `A23`: like A22, but Gaussian noise with 0.15 standard deviation is added to the initial greedy actions.
- `A24`: like A21, but the initial actions are completely random (no greediness).
- `A25`: like A21, with these differences:
- The agent is allowed to worsen the solution, and the episode always ends after 10 adjustments.
- The agent is given a bonus of +100 if, by the end of the episode, its solution has exceeded the initial rl-greedy solution.
- The agent is given a penalty of -100 if the episode ends without ever exceeding the initial rl-greedy solution.
- The reward is the _relative_ difference (in %) between the current and previous solutions
(e.g., the reward is +20 when the solution is improved by 20%).
- `A26`: like A25, but using the absolute difference instead of the relative difference.
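The A25/A26 reward scheme described above can be condensed into a small, self-contained sketch (function and argument names are illustrative, not the environment's actual API; the `> 1` guard mirrors the environment's fallback to the absolute difference when the totals are too small for a meaningful percentage):

```python
def adjustment_reward(total_rewards: list[float], relative_pct: bool, done: bool,
                      bonus: float = 100.0, penalty: float = 100.0) -> float:
    """Illustrative A25/A26-style reward for the latest adjustment.

    total_rewards[0] is the initial (rl-greedy) solution's total reward and
    total_rewards[-1] the one obtained after the latest adjustment.
    """
    current, previous = total_rewards[-1], total_rewards[-2]
    if relative_pct and current > 1 and previous > 1:
        reward = (current - previous) / previous * 100  # relative difference in %
    else:
        reward = current - previous  # absolute difference
    if done:  # bonus/penalty only at the end of the episode
        if max(total_rewards) > total_rewards[0]:
            reward += bonus  # the best solution found exceeded the initial one
        else:
            reward -= penalty
    return reward


# The agent improves the previous solution (190) by 20% on its final adjustment,
# and the best solution (228) exceeds the initial one (200), so the bonus applies:
print(adjustment_reward([200.0, 190.0, 228.0], relative_pct=True, done=True))  # 120.0
```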

Single-step discrete actions:
- `A31`: the agent can only choose among the given optimal flows.
- `A32`: the agent chooses the flow from the given number of discretization levels for each dam.
- `A33`: the agent chooses the turbine count and the flow level within this number of turbines.
6 changes: 3 additions & 3 deletions flowing_basin/scripts/rl_study_env.py
@@ -16,10 +16,10 @@

if __name__ == "__main__":

rl = ReinforcementLearning("rl-A1G0O1R1T12-3", verbose=3)
rl = ReinforcementLearning("rl-A25G0O3R1T3", verbose=3)
# rl.create_train_env()
# print("Configuration:", rl.config.to_dict())
# rl.check_train_env(obs_types=['raw'], initial_date='2020-08-19 00:00', seed=42) # instancePercentile50
rl.check_train_env(obs_types=['raw'], initial_date='2020-08-19 00:00', seed=42) # instancePercentile50
# rl.collect_obs()
# rl.train(save_agent=SAVE_AGENT)

@@ -28,7 +28,7 @@
# rl.plot_training_curve_agent(instances=["Percentile50"])
# rl.plot_training_curves_compare(["rl-A1G0O22R1T02", "rl-A1G0O221R1T02"], ["MILP"], values=["income", "acc_reward"])

print(rl.run_agent("Percentile50").solution.to_dict())
# print(rl.run_agent("Percentile50").solution.to_dict())
# start = time.perf_counter()
# print(rl.run_agent([f"Percentile{i*10:02}" for i in range(0, 11)]))
# print("Time:", time.perf_counter() - start)
8 changes: 7 additions & 1 deletion flowing_basin/solvers/rl/config.py
@@ -244,10 +244,16 @@ class ActionConfiguration(BaseProcessableConfiguration):  # noqa
optimal_flow_values: dict[str, list[float]] = field(default_factory=lambda: dict())
discretization_levels: int = None
num_actions_block: int = 1 # By default, the agent only gives the actions for the current timestep
greediness: float = 1. # Greediness of the baseline greedy agent when action_type == 'adjustments'

# Configurations when action_type == 'adjustments'
greediness: float = 1. # Greediness of the baseline greedy agent
noise_std_dev: float = 0.  # Standard deviation of the Gaussian noise added to the initial greedy actions
randomness: bool = False # If True, use random actions instead of greedy actions
max_iterations: int = 20 # Maximum number of consecutive adjustments
terminate_on_failure: bool = True  # Terminate the episode as soon as the total reward drops below the previous solution's
bonus_exceed_initial: float = 0. # Bonus for exceeding the initial solution (rl-greedy's)
penalty_not_exceed_initial: float = 0. # Penalty for NOT exceeding the initial solution when the episode ends
reward_relative_pct: bool = False # Use the relative improvement (%) of the solution instead of the absolute difference in total reward

def check(self):

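For reference, an A25-equivalent configuration built directly from these fields might look as follows. This is only a sketch: it assumes the dataclass accepts its fields as keyword arguments, that the module path mirrors the file path, and that no required field outside this excerpt is missing.

```python
from flowing_basin.solvers.rl.config import ActionConfiguration

config = ActionConfiguration(
    action_type="adjustments",
    num_actions_block=99,
    max_iterations=10,
    terminate_on_failure=False,        # keep adjusting even after a worse solution
    bonus_exceed_initial=100.0,        # +100 when the initial solution is exceeded
    penalty_not_exceed_initial=100.0,  # -100 when it never is
    reward_relative_pct=True,          # relative (%) improvement (A25); False for A26
)
config.check()  # validation hook defined by the class (body not shown in this excerpt)
```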
33 changes: 25 additions & 8 deletions flowing_basin/solvers/rl/env.py
@@ -802,12 +802,10 @@ def is_done(self, update_as_flows: bool = False) -> bool:
# Stop when instance finishes
done = self.simulator_is_done()
else:
# Stop when solution gets worse or the maximum number of iterations is exceeded
assert len(self.total_rewards) >= 2, "There is no baseline for the current total reward."
done = (
self.total_rewards[-1] < self.total_rewards[-2] or
len(self.total_rewards) - 1 > self.config.max_iterations
)
done = len(self.total_rewards) - 1 > self.config.max_iterations
if self.config.terminate_on_failure:
done = done or self.total_rewards[-1] < self.total_rewards[-2]

return bool(done)
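A standalone sketch of the termination rule above, assuming only that the list holds one total reward per solution built so far (names are illustrative; the environment keeps this list in `self.total_rewards`):

```python
def episode_done(total_rewards: list[float], max_iterations: int,
                 terminate_on_failure: bool) -> bool:
    """Illustrative version of the termination rule for 'adjustments' actions."""
    # total_rewards starts with the initial rl-greedy solution, so
    # len(total_rewards) - 1 is the number of adjustments made so far.
    done = len(total_rewards) - 1 > max_iterations
    if terminate_on_failure:
        # Previous behaviour (still available): stop as soon as the solution worsens
        done = done or total_rewards[-1] < total_rewards[-2]
    return done


# With terminate_on_failure=False (A25/A26), a worse solution no longer ends the episode:
print(episode_done([200.0, 195.0], max_iterations=10, terminate_on_failure=False))  # False
print(episode_done([200.0, 195.0], max_iterations=10, terminate_on_failure=True))   # True
```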

@@ -916,13 +914,32 @@ def step(
total_reward = sum(rewards)
else:
total_reward = None
if self.config.action_type != "adjustments" or update_as_flows or skip_rewards:

# Final reward; this is different from the total reward when action_type == 'adjustments'
not_using_adjustments = self.config.action_type != "adjustments" or update_as_flows or skip_rewards
if not_using_adjustments:
final_reward = total_reward
else:
# Difference with previous total reward
final_reward = total_reward - self.total_rewards[-1]
if self.config.reward_relative_pct:
# Relative difference (%) with previous total reward
if total_reward > 1 and self.total_rewards[-1] > 1:
final_reward = (total_reward - self.total_rewards[-1]) / self.total_rewards[-1] * 100
else:
# When the current or previous reward is negative or has a low value,
# the relative difference is meaningless, so we use the absolute difference instead
final_reward = total_reward - self.total_rewards[-1]
else:
# Absolute difference with previous total reward
final_reward = total_reward - self.total_rewards[-1]
self.total_rewards.append(total_reward)

# Bonuses and penalties to the final reward when action_type == 'adjustments'
if not not_using_adjustments and self.is_done():
if max(self.total_rewards) > self.total_rewards[0]:
final_reward += self.config.bonus_exceed_initial
else:
final_reward -= self.config.penalty_not_exceed_initial

# Unclipped flows equivalent to the given actions
flows_block = np.array(flows_block).reshape(-1) # Give it the same shape as the original action array
self.last_flows_block = flows_block
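To make the reward pipeline above concrete, a small worked trace (purely illustrative; it assumes `reward_relative_pct=True`, a bonus of 100, both totals above 1, and pretends the episode ends after the second adjustment):

```python
# Initial rl-greedy solution, then two adjustments
total_rewards = [200.0, 210.0, 205.0]

for i in range(1, len(total_rewards)):
    current, previous = total_rewards[i], total_rewards[i - 1]
    final_reward = (current - previous) / previous * 100  # relative difference in %
    if i == len(total_rewards) - 1:  # pretend the episode ends here
        # The bonus depends on the *best* solution found, not the last one
        if max(total_rewards[: i + 1]) > total_rewards[0]:
            final_reward += 100.0
        else:
            final_reward -= 100.0
    print(round(final_reward, 2))

# Prints 5.0, then 97.62: the last adjustment worsened the solution (-2.38 %),
# but the bonus still applies because 210 exceeded the initial 200.
```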
