Commit 383a2b1

dev: allow stateful policy
1 parent aa12324 commit 383a2b1

File tree

  • src/evox/problems/neuroevolution/reinforcement_learning

2 files changed: +82 −15 lines
src/evox/problems/neuroevolution/reinforcement_learning/brax.py

Lines changed: 33 additions & 5 deletions
@@ -20,6 +20,8 @@ def __init__(
         env_name: str,
         max_episode_length: int,
         num_episodes: int,
+        stateful_policy: bool = False,
+        initial_state: Any = None,
         reduce_fn: Callable[[jax.Array, int], jax.Array] = jnp.mean,
         backend: str = "generalized",
     ):
@@ -38,6 +40,18 @@ def __init__(
             The maximum number of timesteps of an episode.
         num_episodes
             Evaluating the number of episodes for each individual.
+        stateful_policy
+            Whether the policy is stateful (for example, RNN).
+            Default to False.
+            If False, the policy should be a pure function with signature (weights, obs) -> action.
+            If True, the policy should be a stateful function with signature (state, weights, obs) -> (action, state).
+        initial_state
+            The initial state of the stateful policy.
+            Default to None.
+            Only used when stateful_policy is True.
+        reduce_fn
+            The function to reduce the rewards of multiple episodes.
+            Default to jnp.mean.
         backend
             Brax's backend, one of "generalized", "positional", "spring".
             Default to "generalized".
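
For reference, here is a minimal sketch of the two signatures documented above. The linear and recurrent policies below are illustrative assumptions, not part of this commit:

    import jax.numpy as jnp

    # Stateless policy: (weights, obs) -> action
    def linear_policy(weights, obs):
        return jnp.tanh(obs @ weights["w"] + weights["b"])

    # Stateful policy: (state, weights, obs) -> (action, state)
    # The carry here is a single recurrent hidden vector (an assumed layout).
    def rnn_policy(state, weights, obs):
        hidden = jnp.tanh(obs @ weights["w_in"] + state @ weights["w_rec"])
        action = jnp.tanh(hidden @ weights["w_out"])
        return action, hidden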
@@ -49,6 +63,8 @@ def __init__(
         self.env = envs.wrappers.training.VmapWrapper(
             envs.get_environment(env_name=env_name, backend=backend)
         )
+        self.stateful_policy = stateful_policy
+        self.initial_state = initial_state
         self.max_episode_length = max_episode_length
         self.num_episodes = num_episodes
         self.reduce_fn = reduce_fn
@@ -65,28 +81,40 @@ def evaluate(self, state, weights):
         key, eval_key = jax.random.split(state.key)

         def _cond_func(carry):
-            counter, state, done, _total_reward = carry
+            counter, _state, done, _total_reward = carry
             return (counter < self.max_episode_length) & (~done.all())

         def _body_func(carry):
-            counter, brax_state, done, total_reward = carry
-            action = self.batched_policy(weights, brax_state.obs)
+            counter, rollout_state, done, total_reward = carry
+            if self.stateful_policy:
+                state, brax_state = rollout_state
+                action, state = self.batched_policy(state, weights, brax_state.obs)
+                rollout_state = (state, brax_state)
+            else:
+                (brax_state,) = rollout_state
+                action = self.batched_policy(weights, brax_state.obs)
+                rollout_state = (brax_state,)
             brax_state = self.jit_env_step(brax_state, action)
             done = brax_state.done * (1 - done)
             total_reward += (1 - done) * brax_state.reward
-            return counter + 1, brax_state, done, total_reward
+            return counter + 1, rollout_state, done, total_reward

         brax_state = self.jit_reset(
             vmap_rng_split(jax.random.split(eval_key, self.num_episodes), pop_size)
         )

+        if self.stateful_policy:
+            rollout_state = (self.initial_state, brax_state)
+        else:
+            rollout_state = (brax_state,)
+
         # [pop_size, num_episodes]
         _, _, _, total_reward = jax.lax.while_loop(
             _cond_func,
             _body_func,
             (
                 0,
-                brax_state,
+                rollout_state,
                 jnp.zeros((pop_size, self.num_episodes)),
                 jnp.zeros((pop_size, self.num_episodes)),
             ),
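
The evaluate() change above threads the policy state through the episode by packing it into the jax.lax.while_loop carry. A self-contained illustration of that carry-threading pattern, using a dummy environment step rather than Brax, could look like this:

    import jax
    import jax.numpy as jnp

    def run_episode(policy, weights, init_policy_state, init_obs, max_steps=100):
        # Dummy environment: observation is left unchanged, reward is the action sum.
        def env_step(obs, action):
            return obs, jnp.sum(action)

        def cond_fn(carry):
            step, _rollout_state, _reward = carry
            return step < max_steps

        def body_fn(carry):
            step, (policy_state, obs), reward = carry
            # The policy state rides along in the loop carry, exactly like
            # rollout_state in the diff above.
            action, policy_state = policy(policy_state, weights, obs)
            obs, r = env_step(obs, action)
            return step + 1, (policy_state, obs), reward + r

        _, _, total_reward = jax.lax.while_loop(
            cond_fn, body_fn, (jnp.asarray(0), (init_policy_state, init_obs), jnp.asarray(0.0))
        )
        return total_reward

With the hypothetical rnn_policy sketch above, run_episode(rnn_policy, weights, jnp.zeros(hidden_dim), obs) accumulates the dummy reward over 100 steps.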

src/evox/problems/neuroevolution/reinforcement_learning/gym.py

Lines changed: 49 additions & 10 deletions
@@ -1,4 +1,4 @@
-from typing import Callable, Optional, List
+from typing import Callable, Optional, List, Any

 import gymnasium as gym
 import jax
@@ -58,10 +58,19 @@ def normalize_obvs(self, state, obvs):

 @ray.remote(num_cpus=1)
 class Worker:
-    def __init__(self, env_creator, policy=None, mo_keys=None):
+    def __init__(
+        self,
+        env_creator,
+        policy=None,
+        stateful_policy=False,
+        initial_state=None,
+        mo_keys=None,
+    ):
         self.envs = []
         self.env_creator = env_creator
         self.policy = policy
+        self.stateful_policy = stateful_policy
+        self.initial_state = initial_state
         self.mo_keys = mo_keys

     def step(self, actions):
@@ -124,9 +133,15 @@ def rollout(self, seed, subpop, cap_episode_length):
         assert self.policy is not None
         self.reset(seed, num_env)
         i = 0
+        policy_state = self.initial_state
         while True:
             observations = jnp.asarray(self.observations)
-            actions = np.asarray(self.policy(subpop, observations))
+            if self.stateful_policy:
+                actions = np.asarray(self.policy(subpop, observations))
+            else:
+                actions, policy_state = np.asarray(
+                    self.policy(policy_state, subpop, observations)
+                )
             self.step(actions)

             if np.all(self.terminated | self.truncated):
@@ -144,6 +159,8 @@ class Controller:
     def __init__(
         self,
         policy,
+        stateful_policy,
+        initial_state,
         num_workers,
         env_creator,
         worker_options,
@@ -155,11 +172,15 @@ def __init__(
             Worker.options(**worker_options).remote(
                 env_creator,
                 None if batch_policy else jit(vmap(policy)),
+                stateful_policy,
+                initial_state,
                 mo_keys,
             )
             for _ in range(num_workers)
         ]
         self.policy = policy
+        self.stateful_policy = stateful_policy
+        self.initial_state = initial_state
         self.batch_policy = batch_policy
         self.num_obj = len(mo_keys)

@@ -197,15 +218,22 @@ def _evaluate(self, seed, pop, cap_episode_length):
         return rewards, acc_mo_values, episode_length

     @jit_method
-    def batch_policy_evaluation(self, observations, pop):
-        actions = jax.vmap(self.policy)(
-            pop,
-            observations,
-        )
+    def batch_policy_evaluation(self, policy_state, observations, pop):
+        if self.stateful_policy:
+            actions = jax.vmap(self.policy)(
+                pop,
+                observations,
+            )
+        else:
+            actions, policy_state = jax.vmap(self.policy)(
+                policy_state,
+                pop,
+                observations,
+            )
         # reshape in order to distribute to different workers
         action_dim = actions.shape[1:]
         actions = jnp.array_split(actions, self.num_workers, axis=0)
-        return actions
+        return actions, policy_state

     def _batched_evaluate(self, seed, pop, cap_episode_length):
         pop_size = tree_batch_size(pop)
@@ -225,13 +253,18 @@ def _batched_evaluate(self, seed, pop, cap_episode_length):
         episode_length = 0

         i = 0
+        policy_state = self.initial_state
+        if self.stateful_policy:
+            policy_state = [policy_state for _ in range(pop_size)]
         while True:
             # flatten observations
             observations = [obs for worker_obs in observations for obs in worker_obs]
             observations = np.stack(observations, axis=0)
             observations = jnp.asarray(observations)
             # get action from policy
-            actions = self.batch_policy_evaluation(observations, pop)
+            actions, policy_state = self.batch_policy_evaluation(
+                policy_state, observations, pop
+            )

             futures = [
                 worker.step.remote(np.asarray(action))
@@ -294,6 +327,8 @@ def __init__(
         worker_options: dict = {},
         init_cap: Optional[int] = None,
         batch_policy: bool = False,
+        stateful_policy: bool = False,
+        initial_state: Any = None,
     ):
         """Construct a gym problem

@@ -334,6 +369,8 @@ def __init__(
         self.mo_keys = mo_keys
         self.controller = Controller.options(**controller_options).remote(
             policy,
+            stateful_policy,
+            initial_state,
             num_workers,
             env_creator,
             worker_options,
@@ -343,6 +380,8 @@ def __init__(
         self.num_workers = num_workers
         self.env_name = env_name
         self.policy = policy
+        self.stateful_policy = stateful_policy
+        self.initial_state = initial_state
         if init_cap is not None:
             self.cap_episode = CapEpisode(init_cap=init_cap)
         else:
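
On the gym side, the Controller now threads a policy_state through batch_policy_evaluation, and the vmapped call over (policy_state, pop, observations) gives each individual in the population its own carry. A standalone sketch of that call shape follows; the recurrent policy and the dimensions are assumptions for illustration, not taken from this diff:

    import jax
    import jax.numpy as jnp

    # Hypothetical recurrent policy with the (state, weights, obs) -> (action, state)
    # signature documented in brax.py.
    def rnn_policy(state, weights, obs):
        hidden = jnp.tanh(obs @ weights["w_in"] + state @ weights["w_rec"])
        return jnp.tanh(hidden @ weights["w_out"]), hidden

    pop_size, obs_dim, hidden_dim, act_dim = 4, 3, 8, 2
    k1, k2, k3, k4 = jax.random.split(jax.random.PRNGKey(0), 4)
    pop = {
        "w_in": jax.random.normal(k1, (pop_size, obs_dim, hidden_dim)),
        "w_rec": jax.random.normal(k2, (pop_size, hidden_dim, hidden_dim)),
        "w_out": jax.random.normal(k3, (pop_size, hidden_dim, act_dim)),
    }
    observations = jax.random.normal(k4, (pop_size, obs_dim))
    policy_state = jnp.zeros((pop_size, hidden_dim))  # one carry per individual

    # Mirrors the vmapped call inside Controller.batch_policy_evaluation.
    actions, policy_state = jax.vmap(rnn_policy)(policy_state, pop, observations)
    print(actions.shape, policy_state.shape)  # (4, 2) (4, 8)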

0 commit comments
