Merge branch 'mujoco-v5' of https://github.com/Farama-Foundation/MO-Gymnasium into mujoco-v5
Farama-Foundation · Oct 16, 2024 · 7931b98 · 7931b98
2 parents d615b48 + eab4592
commit 7931b98
Show file tree

Hide file tree

Showing 6 changed files with 312 additions and 0 deletions.
diff --git a/mo_gymnasium/envs/mujoco/__init__.py b/mo_gymnasium/envs/mujoco/__init__.py
@@ -45,6 +45,12 @@
     max_episode_steps=1000,
 )
 
+register(
+    id="mo-walker2d-v5",
+    entry_point="mo_gymnasium.envs.mujoco.walker2d_v5:MOWalker2dEnv",
+    max_episode_steps=1000,
+)
+
 register(
     id="mo-ant-v4",
     entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv",
@@ -58,20 +64,52 @@
     kwargs={"cost_objective": False},
 )
 
+
+register(
+    id="mo-ant-v5",
+    entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv",
+    max_episode_steps=1000,
+)
+
+register(
+    id="mo-ant-2d-v5",
+    entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv",
+    max_episode_steps=1000,
+    kwargs={"cost_objective": False},
+)
+
 register(
     id="mo-swimmer-v4",
     entry_point="mo_gymnasium.envs.mujoco.swimmer:MOSwimmerEnv",
     max_episode_steps=1000,
 )
 
+register(
+    id="mo-swimmer-v5",
+    entry_point="mo_gymnasium.envs.mujoco.swimmer_v5:MOSwimmerEnv",
+    max_episode_steps=1000,
+)
+
 register(
     id="mo-humanoid-v4",
     entry_point="mo_gymnasium.envs.mujoco.humanoid:MOHumanoidEnv",
     max_episode_steps=1000,
 )
 
+register(
+    id="mo-humanoid-v5",
+    entry_point="mo_gymnasium.envs.mujoco.humanoid_v5:MOHumanoidEnv",
+    max_episode_steps=1000,
+)
+
 register(
     id="mo-reacher-v4",
     entry_point="mo_gymnasium.envs.mujoco.reacher_v4:MOReacherEnv",
     max_episode_steps=50,
 )
+
+register(
+    id="mo-reacher-v5",
+    entry_point="mo_gymnasium.envs.mujoco.reacher_v5:MOReacherEnv",
+    max_episode_steps=50,
+)
diff --git a/mo_gymnasium/envs/mujoco/ant_v5.py b/mo_gymnasium/envs/mujoco/ant_v5.py
@@ -0,0 +1,57 @@
+import numpy as np
+from gymnasium.envs.mujoco.ant_v5 import AntEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOAntEnv(AntEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the AntEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/ant/) for more information.
+
+    The original Gymnasium's 'Ant-v5' is recovered by the following linear scalarization:
+
+    env = mo_gym.make('mo-ant-v5', cost_objective=False)
+    LinearReward(env, weight=np.array([1.0, 0.0]))
+
+    ## Reward Space
+    The reward is 2- or 3-dimensional:
+    - 0: x-velocity
+    - 1: y-velocity
+    - 2: Control cost of the action
+    If the cost_objective flag is set to False, the reward is 2-dimensional, and the control and contact costs are added to both velocity objectives.
+    A healthy reward is added to all objectives.
+
+    ## Version History
+    - v5: Now includes contact forces in the reward and observation.
+    See https://gymnasium.farama.org/environments/mujoco/ant/#version-history
+    """
+
+    def __init__(self, cost_objective=True, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, cost_objective, **kwargs)
+        self.cost_objetive = cost_objective
+        self.reward_dim = 3 if cost_objective else 2
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,))
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        x_velocity = info["x_velocity"]
+        y_velocity = info["y_velocity"]
+        cost = info["reward_ctrl"]
+        contact_cost = info["reward_contact"]
+        healthy_reward = info["reward_survive"]
+
+        if self.cost_objetive:
+            cost /= self._ctrl_cost_weight  # Ignore the weight in the original AntEnv
+            contact_cost /= self._contact_cost_weight
+            vec_reward = np.array([x_velocity, y_velocity, cost], dtype=np.float32)
+        else:
+            vec_reward = np.array([x_velocity, y_velocity], dtype=np.float32)
+            vec_reward += cost + contact_cost
+
+        vec_reward += healthy_reward
+
+        return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/humanoid_v5.py b/mo_gymnasium/envs/mujoco/humanoid_v5.py
@@ -0,0 +1,37 @@
+import numpy as np
+from gymnasium.envs.mujoco.humanoid_v5 import HumanoidEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOHumanoidEnv(HumanoidEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the HumanoidEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/humanoid/) for more information.
+
+    ## Reward Space
+    The reward is 2-dimensional:
+    - 0: Reward for running forward (x-velocity)
+    - 1: Control cost of the action
+
+    ## Version History:
+    - v5: Now includes contact forces. See: https://gymnasium.farama.org/environments/mujoco/humanoid/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, **kwargs)
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+        self.reward_dim = 2
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        velocity = info["x_velocity"]
+        negative_cost = 10 * info["reward_ctrl"] + info["reward_contact"]
+        vec_reward = np.array([velocity, negative_cost], dtype=np.float32)
+
+        vec_reward += self.healthy_reward  # All objectives are penalized when the agent falls
+
+        return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/reacher_v5.py b/mo_gymnasium/envs/mujoco/reacher_v5.py
@@ -0,0 +1,101 @@
+from os import path
+
+import numpy as np
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.envs.mujoco.reacher_v5 import ReacherEnv
+from gymnasium.spaces import Box, Discrete
+
+
+DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0}
+
+
+class MOReacherEnv(ReacherEnv):
+    """
+    ## Description
+    Multi-objective version of the [`Reacher-v5` environment](https://gymnasium.farama.org/environments/mujoco/reacher/).
+
+    ## Observation Space
+    The observation is 6-dimensional and contains:
+    - sin and cos of the angles of the central and elbow joints
+    - angular velocity of the central and elbow joints
+
+    ## Action Space
+    The action space is discrete and contains the 3^2=9 possible actions based on applying positive (+1), negative (-1) or zero (0) torque to each of the two joints.
+
+    ## Reward Space
+    The reward is 4-dimensional and is defined by the distance between the tip of the arm and each of the four target locations.
+    For each i={1,2,3,4} it is computed from the Euclidean distance in the xy-plane as:
+    ```math
+        r_i = 1 - 4 * || finger_tip_coord - target_i ||
+    ```
+
+    ## Version History:
+    See https://gymnasium.farama.org/environments/mujoco/reacher/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        utils.EzPickle.__init__(self, **kwargs)
+        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(6,), dtype=np.float64)
+        MujocoEnv.__init__(
+            self,
+            path.join(path.dirname(__file__), "assets", "mo_reacher.xml"),
+            2,
+            observation_space=self.observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+        actions = [-1.0, 0.0, 1.0]
+        self.action_dict = dict()
+        for a1 in actions:
+            for a2 in actions:
+                self.action_dict[len(self.action_dict)] = (a1, a2)
+        self.action_space = Discrete(9)
+        # Target goals: x1, y1, x2, y2, ... x4, y4
+        self.goal = np.array([0.14, 0.0, -0.14, 0.0, 0.0, 0.14, 0.0, -0.14])
+        self.reward_space = Box(low=-1.0, high=1.0, shape=(4,))
+        self.reward_dim = 4
+
+    def step(self, a):
+        real_action = self.action_dict[int(a)]
+        vec_reward = np.array(
+            [
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target1")[:2]),
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target2")[:2]),
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target3")[:2]),
+                1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target4")[:2]),
+            ],
+            dtype=np.float32,
+        )
+
+        self._step_mujoco_simulation(real_action, self.frame_skip)
+        if self.render_mode == "human":
+            self.render()
+
+        ob = self._get_obs()
+        return (
+            ob,
+            vec_reward,
+            False,
+            False,
+            {},
+        )
+
+    def reset_model(self):
+        qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos
+        qpos[:2] = np.array([0, 3.1415 / 2])  # init position
+        qpos[-len(self.goal) :] = self.goal
+        qvel = self.init_qvel + self.np_random.uniform(low=-0.005, high=0.005, size=self.model.nv)
+        qvel[-len(self.goal) :] = 0
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def _get_obs(self):
+        theta = self.data.qpos.flatten()[:2]
+        return np.concatenate(
+            [
+                np.cos(theta),
+                np.sin(theta),
+                self.data.qvel.flatten()[:2] * 0.1,
+            ]
+        )
diff --git a/mo_gymnasium/envs/mujoco/swimmer_v5.py b/mo_gymnasium/envs/mujoco/swimmer_v5.py
@@ -0,0 +1,41 @@
+import numpy as np
+from gymnasium.envs.mujoco.swimmer_v5 import SwimmerEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOSwimmerEnv(SwimmerEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the SwimmerEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/swimmer/) for more information.
+
+    The original Gymnasium's 'Swimmer-v5' is recovered by the following linear scalarization:
+
+    env = mo_gym.make('mo-swimmer-v5')
+    LinearReward(env, weight=np.array([1.0, 1e-4]))
+
+    ## Reward Space
+    The reward is 2-dimensional:
+    - 0: Reward for moving forward (x-velocity)
+    - 1: Control cost of the action
+
+    ## Version History:
+    See https://gymnasium.farama.org/main/environments/mujoco/swimmer/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, **kwargs)
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+        self.reward_dim = 2
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        velocity = info["x_velocity"]
+        energy = -np.sum(np.square(action))
+
+        vec_reward = np.array([velocity, energy], dtype=np.float32)
+
+        return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/walker2d_v5.py b/mo_gymnasium/envs/mujoco/walker2d_v5.py
@@ -0,0 +1,38 @@
+import numpy as np
+from gymnasium.envs.mujoco.walker2d_v5 import Walker2dEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOWalker2dEnv(Walker2dEnv, EzPickle):
+    """
+    ## Description
+    Multi-objective version of the Walker2dEnv environment.
+
+    See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/walker2d/) for more information.
+
+    ## Reward Space
+    The reward is 2-dimensional:
+    - 0: Reward for running forward (x-velocity)
+    - 1: Control cost of the action
+
+    ## Version History
+    - See https://gymnasium.farama.org/main/environments/mujoco/walker2d/#version-history
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        EzPickle.__init__(self, **kwargs)
+        self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+        self.reward_dim = 2
+
+    def step(self, action):
+        observation, reward, terminated, truncated, info = super().step(action)
+        velocity = info["x_velocity"]
+        energy = -np.sum(np.square(action))
+
+        vec_reward = np.array([velocity, energy], dtype=np.float32)
+
+        vec_reward += self.healthy_reward  # All objectives are penalized when the agent falls
+
+        return observation, vec_reward, terminated, truncated, info