diff --git a/.github/workflows/build-publish.yml b/.github/workflows/build-publish.yml
index d44dc96d..a00c7e8e 100644
--- a/.github/workflows/build-publish.yml
+++ b/.github/workflows/build-publish.yml
@@ -4,7 +4,7 @@
# - https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
#
# derived from https://github.com/Farama-Foundation/PettingZoo/blob/e230f4d80a5df3baf9bd905149f6d4e8ce22be31/.github/workflows/build-publish.yml
-name: build-publish
+name: Build artifact for PyPI
on:
push:
@@ -16,35 +16,18 @@ on:
jobs:
build-wheels:
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- include:
- - os: ubuntu-latest
- python: 38
- platform: manylinux_x86_64
- - os: ubuntu-latest
- python: 39
- platform: manylinux_x86_64
- - os: ubuntu-latest
- python: 310
- platform: manylinux_x86_64
- - os: ubuntu-latest
- python: 311
- platform: manylinux_x86_64
+ runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: '3.x'
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+
- name: Install dependencies
- run: python -m pip install --upgrade pip setuptools build
+ run: pipx install build
- name: Build sdist and wheels
- run: python -m build
+ run: pyproject-build
- name: Store wheels
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v4
with:
path: dist
@@ -55,10 +38,11 @@ jobs:
if: github.event_name == 'release' && github.event.action == 'published'
steps:
- name: Download dists
- uses: actions/download-artifact@v2
+ uses: actions/download-artifact@v4
with:
name: artifact
path: dist
+
- name: Publish
uses: pypa/gh-action-pypi-publish@release/v1
with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 80ce02af..9f2cc2ab 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -13,9 +13,7 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
- - run: python -m pip install pre-commit
- - run: python -m pre_commit --version
- - run: python -m pre_commit install
- - run: python -m pre_commit run --all-files
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ - run: pipx install pre-commit
+ - run: pre-commit run --all-files
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cbbea960..05e72fd0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.4.0
+ rev: v5.0.0
hooks:
- id: check-symlinks
- id: destroyed-symlinks
@@ -17,13 +17,13 @@ repos:
- id: detect-private-key
- id: debug-statements
- repo: https://github.com/codespell-project/codespell
- rev: v2.2.4
+ rev: v2.3.0
hooks:
- id: codespell
args:
- --ignore-words-list=reacher, mor
- repo: https://github.com/PyCQA/flake8
- rev: 6.0.0
+ rev: 7.1.1
hooks:
- id: flake8
args:
@@ -34,16 +34,16 @@ repos:
- --show-source
- --statistics
- repo: https://github.com/asottile/pyupgrade
- rev: v3.3.1
+ rev: v3.18.0
hooks:
- id: pyupgrade
args: ["--py37-plus"]
- repo: https://github.com/PyCQA/isort
- rev: 5.12.0
+ rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/python/black
- rev: 23.1.0
+ rev: 24.10.0
hooks:
- id: black
- repo: https://github.com/pycqa/pydocstyle
diff --git a/README.md b/README.md
index fb5f7885..708bb3a6 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ obs, info = env.reset()
next_obs, vector_reward, terminated, truncated, info = env.step(your_agent.act(obs))
# Optionally, you can scalarize the reward function with the LinearReward wrapper
-env = mo_gym.LinearReward(env, weight=np.array([0.8, 0.2, 0.2]))
+env = mo_gym.wrappers.LinearReward(env, weight=np.array([0.8, 0.2, 0.2]))
```
For details on multi-objective MDP's (MOMDP's) and other MORL definitions, see [A practical guide to multi-objective reinforcement learning and planning](https://link.springer.com/article/10.1007/s10458-022-09552-y).
diff --git a/docs/_scripts/gen_env_docs.py b/docs/_scripts/gen_env_docs.py
index eec55219..87a2184e 100644
--- a/docs/_scripts/gen_env_docs.py
+++ b/docs/_scripts/gen_env_docs.py
@@ -41,7 +41,7 @@ def trim(docstring):
pattern = re.compile(r"(?
| [`mo-mountaincar-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) | Continuous / Discrete | `[time_penalty, reverse_penalty, forward_penalty]` | Classic Mountain Car env, but with extra penalties for the forward and reverse actions. From [Vamplew et al. 2011](https://www.researchgate.net/publication/220343783_Empirical_evaluation_methods_for_multiobjective_reinforcement_learning_algorithms). |
-| [`mo-mountaincarcontinuous-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincarcontinuous/) | Continuous / Continuous | `[time_penalty, fuel_consumption_penalty]` | Continuous Mountain Car env, but with penalties for fuel consumption. |
+| [`mo-mountaincar-3d-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) ** | Continuous / Discrete | `[time_penalty, move_penalty, speed_objective]` | The forward and backward penalties are merged into a single move penalty, and a speed objective is added which gives a positive reward equal to the car's speed at that time step.* |
+| [`mo-mountaincar-timemove-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) ** | Continuous / Discrete | `[time_penalty, move_penalty]` | Classic Mountain Car env, but with the penalties for moving backwards and forwards merged into a single move penalty. |
+| [`mo-mountaincar-timespeed-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) ** | Continuous / Discrete | `[time_penalty, speed_objective]` | Classic Mountain Car env, but with an extra speed objective which gives a positive reward equal to the car's speed at that time step.* |
+| [`mo-mountaincarcontinuous-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincarcontinuous/) | Continuous / Continuous | `[time_penalty, fuel_consumption_penalty]` | Continuous Mountain Car env, but with penalties for fuel consumption. |
| [`mo-lunar-lander-v2`](https://mo-gymnasium.farama.org/environments/mo-lunar-lander/) | Continuous / Discrete or Continuous | `[landed, shaped_reward, main_engine_fuel, side_engine_fuel]` | MO version of the `LunarLander-v2` [environment](https://gymnasium.farama.org/environments/box2d/lunar_lander/). Objectives defined similarly as in [Hung et al. 2022](https://openreview.net/forum?id=AwWaBXLIJE). |
+*An additional speed objective was introduced to prevent the agent from converging to a local optimum due to the lack of reward signal for the static (no acceleration) action.
+
+**Read more about these environments and the detailed reasoning behind them in [Pranav Gupta's dissertation](https://drive.google.com/file/d/1yT6hlavYZGmoB2phaIBX_5hbibA3Illa/view?usp=sharing).
+
+
```{toctree}
:hidden:
:glob:
diff --git a/docs/index.md b/docs/index.md
index fb6d56ff..f1d24905 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -11,6 +11,7 @@ lastpage:
introduction/install
introduction/api
wrappers/wrappers
+wrappers/vector_wrappers
examples/morl_baselines
```
diff --git a/docs/wrappers/vector_wrappers.md b/docs/wrappers/vector_wrappers.md
new file mode 100644
index 00000000..ade24022
--- /dev/null
+++ b/docs/wrappers/vector_wrappers.md
@@ -0,0 +1,20 @@
+---
+title: "Vector Wrappers"
+---
+
+# Vector Wrappers
+
+Similar to the normal wrappers, MO-Gymnasium provides a few wrappers that are specifically designed to work with vectorized environments. They are all available directly from the `mo_gymnasium.wrappers.vector` module.
+
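+For example, a minimal sketch of running several copies of an MO environment in parallel and recording their episode statistics (the environment id, the number of copies and the `gamma` value are only illustrative):
+
+```python
+import mo_gymnasium as mo_gym
+from mo_gymnasium.wrappers.vector import MORecordEpisodeStatistics, MOSyncVectorEnv
+
+# Four serial copies of the same MO environment; rewards come back with shape (num_envs, reward_dim).
+envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(4)])
+envs = MORecordEpisodeStatistics(envs, gamma=0.99)
+
+obs, infos = envs.reset(seed=42)
+obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
+envs.close()
+```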
+
+## `MOSyncVectorEnv`
+
+```{eval-rst}
+.. autoclass:: mo_gymnasium.wrappers.vector.MOSyncVectorEnv
+```
+
+## `MORecordEpisodeStatistics`
+
+```{eval-rst}
+.. autoclass:: mo_gymnasium.wrappers.vector.MORecordEpisodeStatistics
+```
diff --git a/docs/wrappers/wrappers.md b/docs/wrappers/wrappers.md
index 542e5cca..acf2ab56 100644
--- a/docs/wrappers/wrappers.md
+++ b/docs/wrappers/wrappers.md
@@ -4,36 +4,36 @@ title: "Wrappers"
# Wrappers
-A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium` module.
+A few wrappers inspired by Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium.wrappers` module.
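+
+As a quick sketch of how they compose (the weight and clipping values below are only illustrative):
+
+```python
+import numpy as np
+import mo_gymnasium as mo_gym
+from mo_gymnasium.wrappers import LinearReward, MOClipReward, MONormalizeReward
+
+env = mo_gym.make("deep-sea-treasure-v0")
+env = MONormalizeReward(env, idx=0)                    # normalize only the first objective
+env = MOClipReward(env, idx=0, min_r=-1.0, max_r=1.0)  # then clip it
+env = LinearReward(env, weight=np.array([0.5, 0.5]))   # finally scalarize via a dot product
+```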
## `LinearReward`
```{eval-rst}
-.. autoclass:: mo_gymnasium.LinearReward
+.. autoclass:: mo_gymnasium.wrappers.LinearReward
```
## `MONormalizeReward`
```{eval-rst}
-.. autoclass:: mo_gymnasium.MONormalizeReward
+.. autoclass:: mo_gymnasium.wrappers.MONormalizeReward
```
## `MOClipReward`
```{eval-rst}
-.. autoclass:: mo_gymnasium.MOClipReward
+.. autoclass:: mo_gymnasium.wrappers.MOClipReward
```
-## `MOSyncVectorEnv`
+## `MORecordEpisodeStatistics`
```{eval-rst}
-.. autoclass:: mo_gymnasium.MOSyncVectorEnv
+.. autoclass:: mo_gymnasium.wrappers.MORecordEpisodeStatistics
```
-## `MORecordEpisodeStatistics`
+## `MOMaxAndSkipObservation`
```{eval-rst}
-.. autoclass:: mo_gymnasium.MORecordEpisodeStatistics
+.. autoclass:: mo_gymnasium.wrappers.MOMaxAndSkipObservation
```
diff --git a/mo_gymnasium/__init__.py b/mo_gymnasium/__init__.py
index 23201d0c..94fe1c92 100644
--- a/mo_gymnasium/__init__.py
+++ b/mo_gymnasium/__init__.py
@@ -2,16 +2,10 @@
# Envs
import mo_gymnasium.envs
+from mo_gymnasium import wrappers
# Utils
-from mo_gymnasium.utils import (
- LinearReward,
- MOClipReward,
- MONormalizeReward,
- MORecordEpisodeStatistics,
- MOSyncVectorEnv,
- make,
-)
+from mo_gymnasium.utils import make
-__version__ = "1.1.0"
+__version__ = "1.2.0"
diff --git a/mo_gymnasium/envs/__init__.py b/mo_gymnasium/envs/__init__.py
index 7e917397..c4846df6 100644
--- a/mo_gymnasium/envs/__init__.py
+++ b/mo_gymnasium/envs/__init__.py
@@ -10,6 +10,5 @@
import mo_gymnasium.envs.minecart
import mo_gymnasium.envs.mountain_car
import mo_gymnasium.envs.mujoco
-import mo_gymnasium.envs.reacher
import mo_gymnasium.envs.resource_gathering
import mo_gymnasium.envs.water_reservoir
diff --git a/mo_gymnasium/envs/fishwood/fishwood.py b/mo_gymnasium/envs/fishwood/fishwood.py
index fad02d77..7aa1242b 100644
--- a/mo_gymnasium/envs/fishwood/fishwood.py
+++ b/mo_gymnasium/envs/fishwood/fishwood.py
@@ -42,8 +42,8 @@ class FishWood(gym.Env, EzPickle):
"""
metadata = {"render_modes": ["human"]}
- FISH = 0
- WOOD = 1
+ FISH = np.array([0], dtype=np.int32)
+ WOOD = np.array([1], dtype=np.int32)
MAX_TS = 200
def __init__(self, render_mode: Optional[str] = None, fishproba=0.1, woodproba=0.9):
@@ -55,17 +55,17 @@ def __init__(self, render_mode: Optional[str] = None, fishproba=0.1, woodproba=0
self.action_space = spaces.Discrete(2) # 2 actions, go fish and go wood
# 2 states, fishing and in the woods
- self.observation_space = spaces.Discrete(2)
+ self.observation_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.int32)
# 2 objectives, amount of fish and amount of wood
self.reward_space = spaces.Box(low=np.array([0, 0]), high=np.array([1.0, 1.0]), dtype=np.float32)
self.reward_dim = 2
- self._state = self.WOOD
+ self._state = self.WOOD.copy()
def reset(self, seed=None, **kwargs):
super().reset(seed=seed)
- self._state = self.WOOD
+ self._state = self.WOOD.copy()
self._timestep = 0
if self.render_mode == "human":
self.render()
@@ -89,7 +89,7 @@ def step(self, action):
rewards[self.FISH] = 1.0
# Execute the action
- self._state = action
+ self._state = np.array([action], dtype=np.int32)
self._timestep += 1
if self.render_mode == "human":
diff --git a/mo_gymnasium/envs/lunar_lander/__init__.py b/mo_gymnasium/envs/lunar_lander/__init__.py
index d4435341..817671fb 100644
--- a/mo_gymnasium/envs/lunar_lander/__init__.py
+++ b/mo_gymnasium/envs/lunar_lander/__init__.py
@@ -2,13 +2,13 @@
register(
- id="mo-lunar-lander-v2",
+ id="mo-lunar-lander-v3",
entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander",
max_episode_steps=1000,
)
register(
- id="mo-lunar-lander-continuous-v2",
+ id="mo-lunar-lander-continuous-v3",
entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander",
max_episode_steps=1000,
kwargs={"continuous": True},
diff --git a/mo_gymnasium/envs/mario/joypad_space.py b/mo_gymnasium/envs/mario/joypad_space.py
index 73969eee..32fb0dfb 100644
--- a/mo_gymnasium/envs/mario/joypad_space.py
+++ b/mo_gymnasium/envs/mario/joypad_space.py
@@ -1,4 +1,5 @@
"""An environment wrapper to convert binary to discrete action space. This is a modified version of the original code from nes-py."""
+
from typing import List
import gymnasium as gym
diff --git a/mo_gymnasium/envs/mario/mario.py b/mo_gymnasium/envs/mario/mario.py
index b7279941..45924ed3 100644
--- a/mo_gymnasium/envs/mario/mario.py
+++ b/mo_gymnasium/envs/mario/mario.py
@@ -7,7 +7,6 @@
from gymnasium.utils import EzPickle, seeding
# from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv
-from gymnasium.wrappers import GrayScaleObservation, ResizeObservation
from nes_py.nes_env import SCREEN_SHAPE_24_BIT
import mo_gymnasium as mo_gym
@@ -16,7 +15,7 @@
from mo_gymnasium.envs.mario.joypad_space import JoypadSpace
-class MOSuperMarioBros(SuperMarioBrosEnv, EzPickle):
+class MOSuperMarioBros(SuperMarioBrosEnv, gym.Env, EzPickle):
"""
## Description
Multi-objective version of the SuperMarioBro environment.
@@ -202,11 +201,14 @@ def step(self, action):
if __name__ == "__main__":
+ from gymnasium.wrappers import ResizeObservation
+ from gymnasium.wrappers.transform_observation import GrayscaleObservation
+
env = MOSuperMarioBros()
env = JoypadSpace(env, SIMPLE_MOVEMENT)
# env = MaxAndSkipEnv(env, 4)
env = ResizeObservation(env, (84, 84))
- env = GrayScaleObservation(env)
+ env = GrayscaleObservation(env)
# env = FrameStack(env, 4)
- env = mo_gym.LinearReward(env)
+ env = mo_gym.wrappers.LinearReward(env)
diff --git a/mo_gymnasium/envs/minecart/minecart.py b/mo_gymnasium/envs/minecart/minecart.py
index b5154192..cd6c64e7 100644
--- a/mo_gymnasium/envs/minecart/minecart.py
+++ b/mo_gymnasium/envs/minecart/minecart.py
@@ -249,9 +249,11 @@ def pareto_front(self, gamma: float, symmetric: bool = True) -> List[np.ndarray]
queue = [
{
"speed": ACCELERATION * self.frame_skip,
- "dist": mine_distance - self.frame_skip * (self.frame_skip + 1) / 2 * ACCELERATION
- if self.incremental_frame_skip
- else mine_distance - ACCELERATION * self.frame_skip * self.frame_skip,
+ "dist": (
+ mine_distance - self.frame_skip * (self.frame_skip + 1) / 2 * ACCELERATION
+ if self.incremental_frame_skip
+ else mine_distance - ACCELERATION * self.frame_skip * self.frame_skip
+ ),
"seq": [ACT_ACCEL],
}
]
diff --git a/mo_gymnasium/envs/mountain_car/__init__.py b/mo_gymnasium/envs/mountain_car/__init__.py
index f75fe751..523de281 100644
--- a/mo_gymnasium/envs/mountain_car/__init__.py
+++ b/mo_gymnasium/envs/mountain_car/__init__.py
@@ -6,3 +6,24 @@
entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
max_episode_steps=200,
)
+
+register(
+ id="mo-mountaincar-3d-v0",
+ entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
+ max_episode_steps=200,
+ kwargs={"add_speed_objective": True, "merge_move_penalty": True},
+)
+
+register(
+ id="mo-mountaincar-timemove-v0",
+ entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
+ max_episode_steps=200,
+ kwargs={"merge_move_penalty": True},
+)
+
+register(
+ id="mo-mountaincar-timespeed-v0",
+ entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
+ max_episode_steps=200,
+ kwargs={"remove_move_penalty": True, "add_speed_objective": True},
+)
diff --git a/mo_gymnasium/envs/mountain_car/mountain_car.py b/mo_gymnasium/envs/mountain_car/mountain_car.py
index 6e88acca..f49cf5ce 100644
--- a/mo_gymnasium/envs/mountain_car/mountain_car.py
+++ b/mo_gymnasium/envs/mountain_car/mountain_car.py
@@ -14,19 +14,50 @@ class MOMountainCar(MountainCarEnv, EzPickle):
See [Gymnasium's env](https://gymnasium.farama.org/environments/classic_control/mountain_car_continuous/) for more information.
## Reward space:
- The reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward.
+ By default, the reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward.
- time penalty: -1.0 for each time step
- reverse penalty: -1.0 for each time step the action is 0 (reverse)
- forward penalty: -1.0 for each time step the action is 2 (forward)
+
+ Alternatively, the reward space can be changed with the following options (see the example below):
+ - add_speed_objective: Add an extra objective corresponding to the speed of the car.
+ - remove_move_penalty: Remove the reverse and forward objectives.
+ - merge_move_penalty: Merge reverse and forward penalties into a single penalty.
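+
+ Example (a sketch using the ids registered in `mo_gymnasium.envs.mountain_car`):
+
+ >>> import mo_gymnasium as mo_gym
+ >>> # "mo-mountaincar-timespeed-v0" is this class with remove_move_penalty=True and
+ >>> # add_speed_objective=True, i.e. a 2D reward vector [time_penalty, speed_objective].
+ >>> env = mo_gym.make("mo-mountaincar-timespeed-v0")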
"""
- def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
+ def __init__(
+ self,
+ render_mode: Optional[str] = None,
+ add_speed_objective: bool = False,
+ remove_move_penalty: bool = False,
+ merge_move_penalty: bool = False,
+ goal_velocity=0,
+ ):
super().__init__(render_mode, goal_velocity)
- EzPickle.__init__(self, render_mode, goal_velocity)
+ EzPickle.__init__(self, render_mode, add_speed_objective, remove_move_penalty, merge_move_penalty, goal_velocity)
+ self.add_speed_objective = add_speed_objective
+ self.remove_move_penalty = remove_move_penalty
+ self.merge_move_penalty = merge_move_penalty
- self.reward_space = spaces.Box(low=np.array([-1, -1, -1]), high=np.array([-1, 0, 0]), shape=(3,), dtype=np.float32)
self.reward_dim = 3
+ if self.add_speed_objective:
+ self.reward_dim += 1
+
+ if self.remove_move_penalty:
+ self.reward_dim -= 2
+ elif self.merge_move_penalty:
+ self.reward_dim -= 1
+
+ low = np.array([-1] * self.reward_dim)
+ high = np.zeros(self.reward_dim)
+ high[0] = -1 # Time penalty is always -1
+ if self.add_speed_objective:
+ low[-1] = 0.0
+ high[-1] = 1.1
+
+ self.reward_space = spaces.Box(low=low, high=high, shape=(self.reward_dim,), dtype=np.float32)
+
def step(self, action: int):
assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid"
@@ -39,11 +70,20 @@ def step(self, action: int):
velocity = 0
terminated = bool(position >= self.goal_position and velocity >= self.goal_velocity)
- # reward = -1.0
- reward = np.zeros(3, dtype=np.float32)
+
+ reward = np.zeros(self.reward_dim, dtype=np.float32)
+
reward[0] = 0.0 if terminated else -1.0 # time penalty
- reward[1] = 0.0 if action != 0 else -1.0 # reverse penalty
- reward[2] = 0.0 if action != 2 else -1.0 # forward penalty
+
+ if not self.remove_move_penalty:
+ if self.merge_move_penalty:
+ reward[1] = 0.0 if action == 1 else -1.0
+ else:
+ reward[1] = 0.0 if action != 0 else -1.0 # reverse penalty
+ reward[2] = 0.0 if action != 2 else -1.0 # forward penalty
+
+ if self.add_speed_objective:
+ reward[-1] = 15 * abs(velocity)
self.state = (position, velocity)
if self.render_mode == "human":
diff --git a/mo_gymnasium/envs/mujoco/reacher_v4.py b/mo_gymnasium/envs/mujoco/reacher_v4.py
index 01a5bc9d..9596f64c 100644
--- a/mo_gymnasium/envs/mujoco/reacher_v4.py
+++ b/mo_gymnasium/envs/mujoco/reacher_v4.py
@@ -13,7 +13,7 @@
class MOReacherEnv(ReacherEnv):
"""
## Description
- Mujoco version of `mo-reacher-v0`, based on [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/).
+ Multi-objective version of the [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/).
## Observation Space
The observation is 6-dimensional and contains:
diff --git a/mo_gymnasium/envs/reacher/__init__.py b/mo_gymnasium/envs/reacher/__init__.py
deleted file mode 100644
index b752382c..00000000
--- a/mo_gymnasium/envs/reacher/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from gymnasium.envs.registration import register
-
-
-register(
- id="mo-reacher-v0",
- entry_point="mo_gymnasium.envs.reacher.reacher:ReacherBulletEnv",
- max_episode_steps=100,
- kwargs={"fixed_initial_state": None},
-)
diff --git a/mo_gymnasium/envs/reacher/reacher.py b/mo_gymnasium/envs/reacher/reacher.py
deleted file mode 100644
index f881f512..00000000
--- a/mo_gymnasium/envs/reacher/reacher.py
+++ /dev/null
@@ -1,158 +0,0 @@
-from typing import Optional
-
-import numpy as np
-from gymnasium import spaces
-from gymnasium.utils import EzPickle, seeding
-from pybulletgym.envs.roboschool.envs.env_bases import BaseBulletEnv
-from pybulletgym.envs.roboschool.robots.robot_bases import MJCFBasedRobot
-from pybulletgym.envs.roboschool.scenes.scene_bases import SingleRobotEmptyScene
-
-
-target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)]))
-
-
-class ReacherBulletEnv(BaseBulletEnv, EzPickle):
- metadata = {"render_modes": ["human", "rgb_array"]}
-
- def __init__(
- self,
- render_mode: Optional[str] = None,
- target=(0.14, 0.0),
- fixed_initial_state: Optional[tuple] = (3.14, 0),
- ):
- EzPickle.__init__(self, render_mode, target, fixed_initial_state)
- self.robot = ReacherRobot(target, fixed_initial_state=fixed_initial_state)
- self.render_mode = render_mode
- BaseBulletEnv.__init__(self, self.robot, render=render_mode == "human")
- self._cam_dist = 0.75
-
- # self.target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14), (0.22, 0.0), (-0.22, 0.0), (0.0, 0.22), (0.0, -0.22), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)]))
- # self.target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)]))
- self.target_positions = list(
- map(
- lambda l: np.array(l),
- [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)],
- )
- )
-
- actions = [-1.0, 0.0, 1.0]
- self.action_dict = dict()
- for a1 in actions:
- for a2 in actions:
- self.action_dict[len(self.action_dict)] = (a1, a2)
-
- self.action_space = spaces.Discrete(9)
- self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32)
- self.reward_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
- self.reward_dim = 4
-
- def create_single_player_scene(self, bullet_client):
- return SingleRobotEmptyScene(bullet_client, gravity=0.0, timestep=0.0165, frame_skip=1)
-
- def step(self, a):
- real_action = self.action_dict[int(a)]
-
- assert not self.scene.multiplayer
- self.robot.apply_action(real_action)
- self.scene.global_step()
-
- state = self.robot.calc_state() # sets self.to_target_vec
-
- """ delta = np.linalg.norm(np.array(self.robot.fingertip.pose().xyz()) - np.array(self.robot.target.pose().xyz()))
- reward = 1. - 4. * delta """
-
- phi = np.zeros(len(self.target_positions), dtype=np.float32)
- for index, target in enumerate(self.target_positions):
- delta = np.linalg.norm(np.array(self.robot.fingertip.pose().xyz()[:2]) - target)
- phi[index] = 1.0 - 4 * delta # 1 - 4
-
- self.HUD(state, real_action, False)
-
- if self.render_mode == "human":
- self._render(mode="human")
-
- return state, phi, False, False, {}
-
- def render(self):
- if self.render_mode == "human":
- self._render(mode="human")
- else:
- return self._render(mode="rgb_array")
-
- def camera_adjust(self):
- x, y, z = self.robot.fingertip.pose().xyz()
- x *= 0.5
- y *= 0.5
- self.camera.move_and_look_at(0.3, 0.3, 0.3, x, y, z)
-
- def reset(self, seed=None, **kwargs):
- self._seed(seed)
- if seed is not None:
- self._np_random, seed = seeding.np_random(seed)
- obs = super().reset()
- if self.render_mode == "human":
- self._render(mode="human")
- return obs, {}
-
-
-class ReacherRobot(MJCFBasedRobot):
- TARG_LIMIT = 0.27
-
- def __init__(self, target, fixed_initial_state=False):
- MJCFBasedRobot.__init__(self, "reacher.xml", "body0", action_dim=2, obs_dim=4)
- self.target_pos = target
- self.fixed_initial_state = fixed_initial_state
-
- def robot_specific_reset(self, bullet_client):
- self.jdict["target_x"].reset_current_position(target_positions[0][0], 0)
- self.jdict["target_y"].reset_current_position(target_positions[0][1], 0)
-
- """ self.jdict["target2_x"].reset_current_position(target_positions[1][0], 0)
- self.jdict["target2_y"].reset_current_position(target_positions[1][1], 0)
- self.jdict["target3_x"].reset_current_position(target_positions[2][0], 0)
- self.jdict["target3_y"].reset_current_position(target_positions[2][1], 0)
- self.jdict["target4_x"].reset_current_position(target_positions[3][0], 0)
- self.jdict["target4_y"].reset_current_position(target_positions[3][1], 0) """
-
- self.fingertip = self.parts["fingertip"]
- self.target = self.parts["target"]
- self.central_joint = self.jdict["joint0"]
- self.elbow_joint = self.jdict["joint1"]
- if self.fixed_initial_state is None:
- self.central_joint.reset_current_position(self.np_random.uniform(low=-3.14, high=3.14), 0)
- self.elbow_joint.reset_current_position(self.np_random.uniform(low=-3.14 / 2, high=3.14 / 2), 0)
- else:
- self.central_joint.reset_current_position(0, 0)
- self.elbow_joint.reset_current_position(self.fixed_initial_state[0], self.fixed_initial_state[1])
-
- def apply_action(self, a):
- assert np.isfinite(a).all()
- self.central_joint.set_motor_torque(0.05 * float(np.clip(a[0], -1, +1)))
- self.elbow_joint.set_motor_torque(0.05 * float(np.clip(a[1], -1, +1)))
-
- def calc_state(self):
- theta, self.theta_dot = self.central_joint.current_relative_position()
- self.gamma, self.gamma_dot = self.elbow_joint.current_relative_position()
- # target_x, _ = self.jdict["target_x"].current_position()
- # target_y, _ = self.jdict["target_y"].current_position()
- self.to_target_vec = np.array(self.fingertip.pose().xyz()) - np.array(self.target.pose().xyz())
- return np.array(
- [
- np.cos(theta),
- np.sin(theta),
- self.theta_dot * 0.1,
- self.gamma,
- self.gamma_dot * 0.1,
- ],
- dtype=np.float32,
- )
-
-
-if __name__ == "__main__":
- env = ReacherBulletEnv()
- # env.render(mode='human')
- obs = env.reset()
- print(env.observation_space.contains(obs), obs.dtype, env.observation_space)
- while True:
- env.step(env.action_space.sample())
- # env.render(mode='human')
diff --git a/mo_gymnasium/py.typed b/mo_gymnasium/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/mo_gymnasium/utils.py b/mo_gymnasium/utils.py
index def90471..a5c41a14 100644
--- a/mo_gymnasium/utils.py
+++ b/mo_gymnasium/utils.py
@@ -1,14 +1,8 @@
-"""Utilities function such as wrappers."""
+"""Utilities functions."""
-import time
-from copy import deepcopy
-from typing import Iterator, Tuple, TypeVar
+from typing import TypeVar
import gymnasium as gym
-import numpy as np
-from gymnasium.vector import SyncVectorEnv
-from gymnasium.wrappers.normalize import RunningMeanStd
-from gymnasium.wrappers.record_episode_statistics import RecordEpisodeStatistics
ObsType = TypeVar("ObsType")
@@ -26,337 +20,3 @@ def make(env_name: str, disable_env_checker: bool = True, **kwargs) -> gym.Env:
"""
"""Disable env checker, as it requires the reward to be a scalar."""
return gym.make(env_name, disable_env_checker=disable_env_checker, **kwargs)
-
-
-class LinearReward(gym.Wrapper, gym.utils.RecordConstructorArgs):
- """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector."""
-
- def __init__(self, env: gym.Env, weight: np.ndarray = None):
- """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector.
-
- Args:
- env: env to wrap
- weight: weight vector to use in the dot product
- """
- gym.utils.RecordConstructorArgs.__init__(self, weight=weight)
- gym.Wrapper.__init__(self, env)
- if weight is None:
- weight = np.ones(shape=env.unwrapped.reward_space.shape)
- self.set_weight(weight)
-
- def set_weight(self, weight: np.ndarray):
- """Changes weights for the scalarization.
-
- Args:
- weight: new weights to set
- Returns: nothing
- """
- assert weight.shape == self.env.unwrapped.reward_space.shape, "Reward weight has different shape than reward vector."
- self.w = weight
-
- def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]:
- """Steps in the environment.
-
- Args:
- action: action to perform
- Returns: obs, scalarized_reward, terminated, truncated, info
- """
- observation, reward, terminated, truncated, info = self.env.step(action)
- scalar_reward = np.dot(reward, self.w)
- info["vector_reward"] = reward
- info["reward_weights"] = self.w
-
- return observation, scalar_reward, terminated, truncated, info
-
-
-class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs):
- """Wrapper to normalize the reward component at index idx. Does not touch other reward components."""
-
- def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8):
- """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
-
- Args:
- env (env): The environment to apply the wrapper
- idx (int): the index of the reward to normalize
- epsilon (float): A stability parameter
- gamma (float): The discount factor that is used in the exponential moving average.
- """
- gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon)
- gym.Wrapper.__init__(self, env)
- self.idx = idx
- self.num_envs = getattr(env, "num_envs", 1)
- self.is_vector_env = getattr(env, "is_vector_env", False)
- self.return_rms = RunningMeanStd(shape=())
- self.returns = np.zeros(self.num_envs)
- self.gamma = gamma
- self.epsilon = epsilon
-
- def step(self, action: ActType):
- """Steps through the environment, normalizing the rewards returned.
-
- Args:
- action: action to perform
- Returns: obs, normalized_rewards, terminated, truncated, infos
- """
- obs, rews, terminated, truncated, infos = self.env.step(action)
- # Extracts the objective value to normalize
- to_normalize = rews[self.idx]
- if not self.is_vector_env:
- to_normalize = np.array([to_normalize])
- self.returns = self.returns * self.gamma + to_normalize
- # Defer normalization to gym implementation
- to_normalize = self.normalize(to_normalize)
- self.returns[terminated] = 0.0
- if not self.is_vector_env:
- to_normalize = to_normalize[0]
- # Injecting the normalized objective value back into the reward vector
- rews[self.idx] = to_normalize
- return obs, rews, terminated, truncated, infos
-
- def normalize(self, rews):
- """Normalizes the rewards with the running mean rewards and their variance.
-
- Args:
- rews: rewards
- Returns: the normalized reward
- """
- self.return_rms.update(self.returns)
- return rews / np.sqrt(self.return_rms.var + self.epsilon)
-
-
-class MOClipReward(gym.RewardWrapper, gym.utils.RecordConstructorArgs):
- """Clip reward[idx] to [min, max]."""
-
- def __init__(self, env: gym.Env, idx: int, min_r, max_r):
- """Clip reward[idx] to [min, max].
-
- Args:
- env: environment to wrap
- idx: index of the MO reward to clip
- min_r: min reward
- max_r: max reward
- """
- gym.utils.RecordConstructorArgs.__init__(self, idx=idx, min_r=min_r, max_r=max_r)
- gym.RewardWrapper.__init__(self, env)
- self.idx = idx
- self.min_r = min_r
- self.max_r = max_r
-
- def reward(self, reward):
- """Clips the reward at the given index.
-
- Args:
- reward: reward to clip.
- Returns: the clipped reward.
- """
- reward[self.idx] = np.clip(reward[self.idx], self.min_r, self.max_r)
- return reward
-
-
-class MOSyncVectorEnv(SyncVectorEnv):
- """Vectorized environment that serially runs multiple environments."""
-
- def __init__(
- self,
- env_fns: Iterator[callable],
- copy: bool = True,
- ):
- """Vectorized environment that serially runs multiple environments.
-
- Args:
- env_fns: env constructors
- copy: If ``True``, then the :meth:`reset` and :meth:`step` methods return a copy of the observations.
- """
- SyncVectorEnv.__init__(self, env_fns, copy=copy)
- # Just overrides the rewards memory to add the number of objectives
- self.reward_space = self.envs[0].unwrapped.reward_space
- self._rewards = np.zeros(
- (
- self.num_envs,
- self.reward_space.shape[0],
- ),
- dtype=np.float64,
- )
-
-
-class MORecordEpisodeStatistics(RecordEpisodeStatistics, gym.utils.RecordConstructorArgs):
- """This wrapper will keep track of cumulative rewards and episode lengths.
-
- After the completion of an episode, ``info`` will look like this::
-
- >>> info = {
- ... "episode": {
- ... "r": "",
- ... "dr": "",
- ... "l": "", # contrary to Gymnasium, these are not a numpy array
- ... "t": ""
- ... },
- ... }
-
- For a vectorized environments the output will be in the form of (be careful to first wrap the env into vector before applying MORewordStatistics)::
-
- >>> infos = {
- ... "final_observation": "",
- ... "_final_observation": "",
- ... "final_info": "",
- ... "_final_info": "",
- ... "episode": {
- ... "r": "",
- ... "dr": "",
- ... "l": "",
- ... "t": ""
- ... },
- ... "_episode": ""
- ... }
- """
-
- def __init__(self, env: gym.Env, gamma: float = 1.0, deque_size: int = 100):
- """This wrapper will keep track of cumulative rewards and episode lengths.
-
- Args:
- env (Env): The environment to apply the wrapper
- gamma (float): Discounting factor
- deque_size: The size of the buffers :attr:`return_queue` and :attr:`length_queue`
- """
- gym.utils.RecordConstructorArgs.__init__(self, gamma=gamma, deque_size=deque_size)
- RecordEpisodeStatistics.__init__(self, env, deque_size=deque_size)
- # CHANGE: Here we just override the standard implementation to extend to MO
- # We also take care of the case where the env is vectorized
- self.reward_dim = self.env.unwrapped.reward_space.shape[0]
- if self.is_vector_env:
- self.rewards_shape = (self.num_envs, self.reward_dim)
- else:
- self.rewards_shape = (self.reward_dim,)
- self.gamma = gamma
-
- def reset(self, **kwargs):
- """Resets the environment using kwargs and resets the episode returns and lengths."""
- obs, info = super().reset(**kwargs)
-
- # CHANGE: Here we just override the standard implementation to extend to MO
- self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32)
- self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32)
-
- return obs, info
-
- def step(self, action):
- """Steps through the environment, recording the episode statistics."""
- # This is very close the code from the RecordEpisodeStatistics wrapper from gym.
- (
- observations,
- rewards,
- terminations,
- truncations,
- infos,
- ) = self.env.step(action)
- assert isinstance(
- infos, dict
- ), f"`info` dtype is {type(infos)} while supported dtype is `dict`. This may be due to usage of other wrappers in the wrong order."
- self.episode_returns += rewards
- self.episode_lengths += 1
-
- # CHANGE: The discounted returns are also computed here
- self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape(
- self.episode_returns.shape
- )
-
- dones = np.logical_or(terminations, truncations)
- num_dones = np.sum(dones)
- if num_dones:
- if "episode" in infos or "_episode" in infos:
- raise ValueError("Attempted to add episode stats when they already exist")
- else:
- episode_return = np.zeros(self.rewards_shape, dtype=np.float32)
- disc_episode_return = np.zeros(self.rewards_shape, dtype=np.float32)
- if self.is_vector_env:
- for i in range(self.num_envs):
- if dones[i]:
- # CHANGE: Makes a deepcopy to avoid subsequent mutations
- episode_return[i] = deepcopy(self.episode_returns[i])
- disc_episode_return[i] = deepcopy(self.disc_episode_returns[i])
- else:
- episode_return = deepcopy(self.episode_returns)
- disc_episode_return = deepcopy(self.disc_episode_returns)
-
- length_eps = np.where(dones, self.episode_lengths, 0)
- time_eps = np.where(
- dones,
- np.round(time.perf_counter() - self.episode_start_times, 6),
- 0.0,
- )
-
- infos["episode"] = {
- "r": episode_return,
- "dr": disc_episode_return,
- "l": length_eps[0] if not self.is_vector_env else length_eps,
- "t": time_eps[0] if not self.is_vector_env else time_eps,
- }
- if self.is_vector_env:
- infos["_episode"] = np.where(dones, True, False)
- self.return_queue.extend(self.episode_returns[dones])
- self.length_queue.extend(self.episode_lengths[dones])
- self.episode_count += num_dones
- self.episode_lengths[dones] = 0
- self.episode_returns[dones] = np.zeros(self.reward_dim, dtype=np.float32)
- self.disc_episode_returns[dones] = np.zeros(self.reward_dim, dtype=np.float32)
- self.episode_start_times[dones] = time.perf_counter()
- return (
- observations,
- rewards,
- terminations,
- truncations,
- infos,
- )
-
-
-class MOMaxAndSkipObservation(gym.Wrapper):
- """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last observations.
-
- Note: This wrapper is based on the wrapper from stable-baselines3: https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html#MaxAndSkipEnv
- """
-
- def __init__(self, env: gym.Env[ObsType, ActType], skip: int = 4):
- """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last frames.
-
- Args:
- env (Env): The environment to apply the wrapper
- skip: The number of frames to skip
- """
- gym.Wrapper.__init__(self, env)
-
- if not np.issubdtype(type(skip), np.integer):
- raise TypeError(f"The skip is expected to be an integer, actual type: {type(skip)}")
- if skip < 2:
- raise ValueError(f"The skip value needs to be equal or greater than two, actual value: {skip}")
- if env.observation_space.shape is None:
- raise ValueError("The observation space must have the shape attribute.")
-
- self._skip = skip
- self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype)
-
- def step(self, action):
- """Step the environment with the given action for ``skip`` steps.
-
- Repeat action, sum reward, and max over last observations.
-
- Args:
- action: The action to step through the environment with
- Returns:
- Max of the last two observations, reward, terminated, truncated, and info from the environment
- """
- total_reward = np.zeros(self.env.unwrapped.reward_dim, dtype=np.float32)
- terminated = truncated = False
- info = {}
- for i in range(self._skip):
- obs, reward, terminated, truncated, info = self.env.step(action)
- done = terminated or truncated
- if i == self._skip - 2:
- self._obs_buffer[0] = obs
- if i == self._skip - 1:
- self._obs_buffer[1] = obs
- total_reward += reward
- if done:
- break
- max_frame = self._obs_buffer.max(axis=0)
-
- return max_frame, total_reward, terminated, truncated, info
diff --git a/mo_gymnasium/wrappers/__init__.py b/mo_gymnasium/wrappers/__init__.py
new file mode 100644
index 00000000..274241a0
--- /dev/null
+++ b/mo_gymnasium/wrappers/__init__.py
@@ -0,0 +1,10 @@
+"""Contains all wrappers (vectors or not)."""
+
+from mo_gymnasium.wrappers import vector
+from mo_gymnasium.wrappers.wrappers import (
+ LinearReward,
+ MOClipReward,
+ MOMaxAndSkipObservation,
+ MONormalizeReward,
+ MORecordEpisodeStatistics,
+)
diff --git a/mo_gymnasium/wrappers/vector/__init__.py b/mo_gymnasium/wrappers/vector/__init__.py
new file mode 100644
index 00000000..60225b17
--- /dev/null
+++ b/mo_gymnasium/wrappers/vector/__init__.py
@@ -0,0 +1,6 @@
+"""Vector wrappers."""
+
+from mo_gymnasium.wrappers.vector.wrappers import (
+ MORecordEpisodeStatistics,
+ MOSyncVectorEnv,
+)
diff --git a/mo_gymnasium/wrappers/vector/wrappers.py b/mo_gymnasium/wrappers/vector/wrappers.py
new file mode 100644
index 00000000..6028061d
--- /dev/null
+++ b/mo_gymnasium/wrappers/vector/wrappers.py
@@ -0,0 +1,227 @@
+"""Vector wrappers."""
+
+import time
+from copy import deepcopy
+from typing import Any, Dict, Iterator, Tuple
+
+import gymnasium as gym
+import numpy as np
+from gymnasium.core import ActType, ObsType
+from gymnasium.vector import SyncVectorEnv
+from gymnasium.vector.utils import concatenate, iterate
+from gymnasium.vector.vector_env import ArrayType, VectorEnv
+from gymnasium.wrappers.vector import RecordEpisodeStatistics
+
+
+class MOSyncVectorEnv(SyncVectorEnv):
+ """Vectorized environment that serially runs multiple environments.
+
+ Example:
+ >>> import mo_gymnasium as mo_gym
+
+ >>> envs = mo_gym.wrappers.vector.MOSyncVectorEnv([
+ ... lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(4)
+ ... ])
+ >>> envs
+ MOSyncVectorEnv(num_envs=4)
+ >>> obs, infos = envs.reset()
+ >>> obs
+ array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype=int32)
+ >>> _ = envs.action_space.seed(42)
+ >>> actions = envs.action_space.sample()
+ >>> obs, rewards, terminateds, truncateds, infos = envs.step([0, 1, 2, 3])
+ >>> obs
+ array([[0, 0], [1, 0], [0, 0], [0, 3]], dtype=int32)
+ >>> rewards
+ array([[0., -1.], [0.7, -1.], [0., -1.], [0., -1.]], dtype=float32)
+ >>> terminateds
+ array([False, True, False, False])
+ """
+
+ def __init__(
+ self,
+ env_fns: Iterator[callable],
+ copy: bool = True,
+ ):
+ """Vectorized environment that serially runs multiple environments.
+
+ Args:
+ env_fns: env constructors
+ copy: If ``True``, then the :meth:`reset` and :meth:`step` methods return a copy of the observations.
+ """
+ SyncVectorEnv.__init__(self, env_fns, copy=copy)
+ # Just overrides the rewards memory to add the number of objectives
+ self.reward_space = self.envs[0].unwrapped.reward_space
+ self._rewards = np.zeros(
+ (
+ self.num_envs,
+ self.reward_space.shape[0],
+ ),
+ dtype=np.float32,
+ )
+
+ def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayType, Dict[str, Any]]:
+ """Steps through each of the environments returning the batched results.
+
+ Returns:
+ The batched environment step results
+ """
+ actions = iterate(self.action_space, actions)
+
+ observations, infos = [], {}
+ for i, action in enumerate(actions):
+ if self._autoreset_envs[i]:
+ env_obs, env_info = self.envs[i].reset()
+
+ self._rewards[i] = np.zeros(self.reward_space.shape[0])  # This overrides Gymnasium's implementation
+ self._terminations[i] = False
+ self._truncations[i] = False
+ else:
+ (
+ env_obs,
+ self._rewards[i],
+ self._terminations[i],
+ self._truncations[i],
+ env_info,
+ ) = self.envs[i].step(action)
+
+ observations.append(env_obs)
+ infos = self._add_info(infos, env_info, i)
+
+ # Concatenate the observations
+ self._observations = concatenate(self.single_observation_space, observations, self._observations)
+ self._autoreset_envs = np.logical_or(self._terminations, self._truncations)
+
+ return (
+ deepcopy(self._observations) if self.copy else self._observations,
+ np.copy(self._rewards),
+ np.copy(self._terminations),
+ np.copy(self._truncations),
+ infos,
+ )
+
+
+class MORecordEpisodeStatistics(RecordEpisodeStatistics):
+ """This wrapper will keep track of cumulative rewards and episode lengths.
+
+ At the end of any episode within the vectorized env, the statistics of the episode
+ will be added to ``info`` using the key ``episode``, and the ``_episode`` key
+ is used to indicate the environment index which has a terminated or truncated episode.
+
+ For vectorized environments, the output will be in the form of (be careful to first wrap the env into a vector env before applying MORecordEpisodeStatistics)::
+
+ >>> infos = { # doctest: +SKIP
+ ... "episode": {
+ ... "r": "",
+ ... "dr": "",
+ ... "l": "",
+ ... "t": ""
+ ... },
+ ... "_episode": ""
+ ... }
+
+ Moreover, the most recent rewards and episode lengths are stored in buffers that can be accessed via
+ :attr:`wrapped_env.return_queue` and :attr:`wrapped_env.length_queue` respectively.
+
+ Attributes:
+ return_queue: The cumulative rewards of the last ``deque_size``-many episodes
+ length_queue: The lengths of the last ``deque_size``-many episodes
+ """
+
+ def __init__(
+ self,
+ env: VectorEnv,
+ gamma: float = 1.0,
+ buffer_length: int = 100,
+ stats_key: str = "episode",
+ ):
+ """This wrapper will keep track of cumulative rewards and episode lengths.
+
+ Args:
+ env (Env): The environment to apply the wrapper
+ gamma: The discount factor
+ buffer_length: The size of the buffers :attr:`return_queue`, :attr:`length_queue` and :attr:`time_queue`
+ stats_key: The info key to save the data
+ """
+ gym.utils.RecordConstructorArgs.__init__(self, buffer_length=buffer_length, stats_key=stats_key)
+ RecordEpisodeStatistics.__init__(self, env, buffer_length=buffer_length, stats_key=stats_key)
+ self.disc_episode_returns = None
+ self.reward_dim = self.env.unwrapped.reward_space.shape[0]
+ self.rewards_shape = (self.num_envs, self.reward_dim)
+ self.gamma = gamma
+
+ def reset(self, **kwargs):
+ """Resets the environment using kwargs and resets the episode returns and lengths."""
+ obs, info = super().reset(**kwargs)
+
+ # CHANGE: Here we just override the standard implementation to extend to MO
+ self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32)
+ self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32)
+
+ return obs, info
+
+ def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayType, Dict[str, Any]]:
+ """Steps through the environment, recording the episode statistics."""
+ (
+ observations,
+ rewards,
+ terminations,
+ truncations,
+ infos,
+ ) = self.env.step(actions)
+
+ assert isinstance(
+ infos, dict
+ ), f"`vector.RecordEpisodeStatistics` requires `info` type to be `dict`, its actual type is {type(infos)}. This may be due to usage of other wrappers in the wrong order."
+
+ self.episode_returns[self.prev_dones] = 0
+ self.episode_lengths[self.prev_dones] = 0
+ self.episode_start_times[self.prev_dones] = time.perf_counter()
+ self.episode_returns[~self.prev_dones] += rewards[~self.prev_dones]
+
+ # CHANGE: The discounted returns are also computed here
+ self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape(
+ self.episode_returns.shape
+ )
+ self.episode_lengths[~self.prev_dones] += 1
+
+ self.prev_dones = dones = np.logical_or(terminations, truncations)
+ num_dones = np.sum(dones)
+ if num_dones:
+ if self._stats_key in infos or f"_{self._stats_key}" in infos:
+ raise ValueError(f"Attempted to add episode stats when they already exist, info keys: {list(infos.keys())}")
+ else:
+ # CHANGE to handle the vectorial reward and do deepcopies
+ episode_return = np.zeros(self.rewards_shape, dtype=np.float32)
+ disc_episode_return = np.zeros(self.rewards_shape, dtype=np.float32)
+
+ for i in range(self.num_envs):
+ if dones[i]:
+ episode_return[i] = np.copy(self.episode_returns[i])
+ disc_episode_return[i] = np.copy(self.disc_episode_returns[i])
+
+ episode_time_length = np.round(time.perf_counter() - self.episode_start_times, 6)
+ infos[self._stats_key] = {
+ "r": episode_return,
+ "dr": disc_episode_return,
+ "l": np.where(dones, self.episode_lengths, 0),
+ "t": np.where(dones, episode_time_length, 0.0),
+ }
+ infos[f"_{self._stats_key}"] = dones
+
+ self.episode_count += num_dones
+
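+ # np.where(dones) returns a 1-tuple, so this loop runs once with `i` holding the indices
+ # of the finished environments; fancy indexing then extends each queue with one entry per
+ # finished episode.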
+ for i in np.where(dones):
+ self.time_queue.extend(episode_time_length[i])
+ self.return_queue.extend(self.episode_returns[i])
+ self.length_queue.extend(self.episode_lengths[i])
+
+ return (
+ observations,
+ rewards,
+ terminations,
+ truncations,
+ infos,
+ )
diff --git a/mo_gymnasium/wrappers/wrappers.py b/mo_gymnasium/wrappers/wrappers.py
new file mode 100644
index 00000000..f7830865
--- /dev/null
+++ b/mo_gymnasium/wrappers/wrappers.py
@@ -0,0 +1,305 @@
+"""Wrappers."""
+
+import time
+from copy import deepcopy
+from typing import Tuple, TypeVar
+
+import gymnasium as gym
+import numpy as np
+from gymnasium.wrappers.common import RecordEpisodeStatistics
+from gymnasium.wrappers.utils import RunningMeanStd
+
+
+ObsType = TypeVar("ObsType")
+ActType = TypeVar("ActType")
+
+
+class LinearReward(gym.Wrapper, gym.utils.RecordConstructorArgs):
+ """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector."""
+
+ def __init__(self, env: gym.Env, weight: np.ndarray = None):
+ """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector.
+
+ Args:
+ env: env to wrap
+ weight: weight vector to use in the dot product
+ """
+ gym.utils.RecordConstructorArgs.__init__(self, weight=weight)
+ gym.Wrapper.__init__(self, env)
+ if weight is None:
+ weight = np.ones(shape=env.unwrapped.reward_space.shape)
+ self.set_weight(weight)
+
+ def set_weight(self, weight: np.ndarray):
+ """Changes weights for the scalarization.
+
+ Args:
+ weight: new weights to set
+ Returns: nothing
+ """
+ assert weight.shape == self.env.unwrapped.reward_space.shape, "Reward weight has different shape than reward vector."
+ self.w = weight
+
+ def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]:
+ """Steps in the environment.
+
+ Args:
+ action: action to perform
+ Returns: obs, scalarized_reward, terminated, truncated, info
+ """
+ observation, reward, terminated, truncated, info = self.env.step(action)
+ scalar_reward = np.dot(reward, self.w)
+ info["vector_reward"] = reward
+ info["reward_weights"] = self.w
+
+ return observation, scalar_reward, terminated, truncated, info
+
+
+class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs):
+ """Wrapper to normalize the reward component at index idx. Does not touch other reward components.
+
+ This code is heavily inspired by Gymnasium's NormalizeReward, except that it extracts the reward component at the given idx, normalizes it, and reinjects it into the reward vector.
+
+ (!) This scales the reward component so that its discounted return has roughly unit variance, which can help training stability. It does not "normalize" the reward in the sense of giving it zero mean and unit standard deviation.
+
+ Example:
+ >>> import mo_gymnasium as mo_gym
+ >>> from mo_gymnasium.wrappers import MONormalizeReward
+ >>> env = mo_gym.make("deep-sea-treasure-v0")
+ >>> norm_treasure_env = MONormalizeReward(env, idx=0)
+ >>> both_norm_env = MONormalizeReward(norm_treasure_env, idx=1)
+ >>> both_norm_env.reset() # This one normalizes both rewards
+
+ """
+
+ def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8):
+ """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
+
+ Args:
+ env (env): The environment to apply the wrapper
+ idx (int): the index of the reward to normalize
+ epsilon (float): A stability parameter
+ gamma (float): The discount factor that is used in the exponential moving average.
+ """
+ gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon)
+ gym.Wrapper.__init__(self, env)
+ self.idx = idx
+ self.return_rms = RunningMeanStd(shape=())
+ self.discounted_reward: np.array = np.array([0.0])
+ self.gamma = gamma
+ self.epsilon = epsilon
+ self._update_running_mean = True
+
+ @property
+ def update_running_mean(self) -> bool:
+ """Property to freeze/continue the running mean calculation of the reward statistics."""
+ return self._update_running_mean
+
+ @update_running_mean.setter
+ def update_running_mean(self, setting: bool):
+ """Sets the property to freeze/continue the running mean calculation of the reward statistics."""
+ self._update_running_mean = setting
+
+ def step(self, action: ActType):
+ """Steps through the environment, normalizing the rewards returned.
+
+ Args:
+ action: action to perform
+ Returns: obs, normalized_rewards, terminated, truncated, infos
+ """
+ obs, rews, terminated, truncated, infos = self.env.step(action)
+ # Extracts the objective value to normalize
+ to_normalize = rews[self.idx]
+
+ self.discounted_reward = self.discounted_reward * self.gamma * (1 - terminated) + float(to_normalize)
+ if self._update_running_mean:
+ self.return_rms.update(self.discounted_reward)
+
+ # We deliberately do not subtract the mean (reward - self.return_rms.mean); see https://github.com/openai/baselines/issues/538
+ normalized_reward = to_normalize / np.sqrt(self.return_rms.var + self.epsilon)
+
+ # Injecting the normalized objective value back into the reward vector
+ rews[self.idx] = normalized_reward
+ return obs, rews, terminated, truncated, infos
+
+
+class MOClipReward(gym.RewardWrapper, gym.utils.RecordConstructorArgs):
+ """Clip reward[idx] to [min, max]."""
+
+ def __init__(self, env: gym.Env, idx: int, min_r, max_r):
+ """Clip reward[idx] to [min, max].
+
+ Args:
+ env: environment to wrap
+ idx: index of the MO reward to clip
+ min_r: min reward
+ max_r: max reward
+ """
+ gym.utils.RecordConstructorArgs.__init__(self, idx=idx, min_r=min_r, max_r=max_r)
+ gym.RewardWrapper.__init__(self, env)
+ self.idx = idx
+ self.min_r = min_r
+ self.max_r = max_r
+
+ def reward(self, reward):
+ """Clips the reward at the given index.
+
+ Args:
+ reward: reward to clip.
+ Returns: the clipped reward.
+ """
+ reward[self.idx] = np.clip(reward[self.idx], self.min_r, self.max_r)
+ return reward
+
+
+class MORecordEpisodeStatistics(RecordEpisodeStatistics, gym.utils.RecordConstructorArgs):
+ """This wrapper will keep track of cumulative rewards and episode lengths.
+
+ After the completion of an episode, ``info`` will look like this::
+
+ >>> info = {
+ ... "episode": {
+ ... "r": "",
+ ... "dr": "",
+ ... "l": "",
+ ... "t": ""
+ ... },
+ ... }
+ """
+
+ def __init__(
+ self,
+ env: gym.Env,
+ gamma: float = 1.0,
+ buffer_length: int = 100,
+ stats_key: str = "episode",
+ ):
+ """This wrapper will keep track of cumulative rewards and episode lengths.
+
+ Args:
+ env (Env): The environment to apply the wrapper
+ gamma (float): Discounting factor
+ buffer_length: The size of the buffers :attr:`return_queue`, :attr:`length_queue` and :attr:`time_queue`
+ stats_key: The info key for the episode statistics
+ """
+ gym.utils.RecordConstructorArgs.__init__(self, gamma=gamma, buffer_length=buffer_length, stats_key=stats_key)
+ RecordEpisodeStatistics.__init__(self, env, buffer_length=buffer_length, stats_key=stats_key)
+ # CHANGE: Here we just override the standard implementation to extend to MO
+ self.reward_dim = self.env.unwrapped.reward_space.shape[0]
+ self.rewards_shape = (self.reward_dim,)
+ self.gamma = gamma
+
+ def step(self, action):
+ """Steps through the environment, recording the episode statistics."""
+ # This is very close to the code of the RecordEpisodeStatistics wrapper from Gymnasium.
+ (
+ observation,
+ rewards,
+ terminated,
+ truncated,
+ info,
+ ) = self.env.step(action)
+ assert isinstance(
+ info, dict
+ ), f"`info` dtype is {type(info)} while supported dtype is `dict`. This may be due to usage of other wrappers in the wrong order."
+ self.episode_returns += rewards
+
+ # CHANGE: The discounted returns are also computed here
+ self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape(
+ self.episode_returns.shape
+ )
+ self.episode_lengths += 1
+
+ if terminated or truncated:
+ assert self._stats_key not in info
+
+ episode_time_length = round(time.perf_counter() - self.episode_start_time, 6)
+
+ # Make a deepcopy to avoid subsequent mutation of the numpy array
+ episode_returns = deepcopy(self.episode_returns)
+ disc_episode_returns = deepcopy(self.disc_episode_returns)
+
+ info["episode"] = {
+ "r": episode_returns,
+ "dr": disc_episode_returns,
+ "l": self.episode_lengths,
+ "t": episode_time_length,
+ }
+
+ self.time_queue.append(episode_time_length)
+ self.return_queue.append(episode_returns)
+ self.length_queue.append(self.episode_lengths)
+
+ self.episode_count += 1
+ self.episode_start_time = time.perf_counter()
+
+ return (
+ observation,
+ rewards,
+ terminated,
+ truncated,
+ info,
+ )
+
+ def reset(self, **kwargs):
+ """Resets the environment using kwargs and resets the episode returns and lengths."""
+ obs, info = super().reset(**kwargs)
+
+ # CHANGE: Here we just override the standard implementation to extend to MO
+ self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32)
+ self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32)
+
+ return obs, info
+
+
+class MOMaxAndSkipObservation(gym.Wrapper):
+ """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last observations.
+
+ Note: This wrapper is based on the wrapper from stable-baselines3: https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html#MaxAndSkipEnv
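+
+ Example (a sketch; the environment id is only illustrative, any MO env whose observations are arrays works):
+
+ >>> import mo_gymnasium as mo_gym
+ >>> from mo_gymnasium.wrappers import MOMaxAndSkipObservation
+ >>> env = MOMaxAndSkipObservation(mo_gym.make("deep-sea-treasure-v0"), skip=4)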
+ """
+
+ def __init__(self, env: gym.Env[ObsType, ActType], skip: int = 4):
+ """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last frames.
+
+ Args:
+ env (Env): The environment to apply the wrapper
+ skip: The number of frames to skip
+ """
+ gym.Wrapper.__init__(self, env)
+
+ if not np.issubdtype(type(skip), np.integer):
+ raise TypeError(f"The skip is expected to be an integer, actual type: {type(skip)}")
+ if skip < 2:
+ raise ValueError(f"The skip value needs to be equal or greater than two, actual value: {skip}")
+ if env.observation_space.shape is None:
+ raise ValueError("The observation space must have the shape attribute.")
+
+ self._skip = skip
+ self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype)
+
+ def step(self, action):
+ """Step the environment with the given action for ``skip`` steps.
+
+ Repeat action, sum reward, and max over last observations.
+
+ Args:
+ action: The action to step through the environment with
+ Returns:
+ Max of the last two observations, reward, terminated, truncated, and info from the environment
+ """
+ total_reward = np.zeros(self.env.unwrapped.reward_dim, dtype=np.float32)
+ terminated = truncated = False
+ info = {}
+ for i in range(self._skip):
+ obs, reward, terminated, truncated, info = self.env.step(action)
+ done = terminated or truncated
+ if i == self._skip - 2:
+ self._obs_buffer[0] = obs
+ if i == self._skip - 1:
+ self._obs_buffer[1] = obs
+ total_reward += reward
+ if done:
+ break
+ max_frame = self._obs_buffer.max(axis=0)
+
+ return max_frame, total_reward, terminated, truncated, info
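# Illustrative sketch (not part of the patch): the step()/reset() overrides above
# accumulate a vector-valued return and a vector-valued *discounted* return per
# episode. The snippet below reproduces that bookkeeping by hand and checks it
# against info["episode"]; the env id "deep-sea-treasure-v0", the seed and
# gamma=0.99 are arbitrary choices.
import numpy as np
import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers import MORecordEpisodeStatistics

gamma = 0.99
env = MORecordEpisodeStatistics(mo_gym.make("deep-sea-treasure-v0"), gamma=gamma)
obs, info = env.reset(seed=42)

reward_dim = env.unwrapped.reward_space.shape[0]
manual_return = np.zeros(reward_dim, dtype=np.float32)
manual_disc_return = np.zeros(reward_dim, dtype=np.float32)
t, done = 0, False
while not done:
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    manual_return += reward
    manual_disc_return += (gamma**t) * reward  # same rule as the wrapper's step()
    t += 1
    done = terminated or truncated

# At the end of the episode the wrapper reports the same quantities.
np.testing.assert_allclose(info["episode"]["r"], manual_return, atol=1e-4)
np.testing.assert_allclose(info["episode"]["dr"], manual_disc_return, atol=1e-4)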
diff --git a/pyproject.toml b/pyproject.toml
index 160f2b53..d4b42d55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ readme = "README.md"
requires-python = ">= 3.8"
authors = [{ name = "Farama Foundation", email = "contact@farama.org" }]
license = { text = "MIT License" }
-keywords = ["Reinforcement Learning", "Multi-Objective", "RL", "AI", "gymnasium"]
+keywords = ["Reinforcement Learning", "Multi-Objective", "RL", "AI", "Gymnasium"]
classifiers = [
"Development Status :: 4 - Beta", # change to `5 - Production/Stable` when ready
"License :: OSI Approved :: MIT License",
@@ -22,8 +22,8 @@ classifiers = [
'Topic :: Scientific/Engineering :: Artificial Intelligence',
]
dependencies = [
- "gymnasium>=0.28.1,<0.30",
- "numpy >=1.21.0",
+ "gymnasium >=1.0.0",
+ "numpy >=1.21.0,<2.0",
"pygame >=2.1.0",
"scipy >=1.7.3",
"pymoo >=0.6.0",
@@ -49,7 +49,7 @@ all = [
"imageio >=2.14.1",
"mujoco >=2.2.0",
# highway
- "highway-env >= 1.8",
+ "highway-env >= 1.9.1",
# box2d
"box2d-py ==2.3.5",
"pygame ==2.1.3.dev8",
@@ -73,12 +73,12 @@ include = ["mo_gymnasium", "mo_gymnasium.*"]
mo_gymnasium = [
"**/*.json",
"**/assets/*",
+ "py.typed"
]
# Linters and Test tools #######################################################
[tool.black]
-safe = true
line-length = 127
target-version = ['py38', 'py39', 'py310', 'py311']
include = '\.pyi?$'
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 28af4b0c..568cc6b6 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -14,6 +14,7 @@
for env_spec in gym.envs.registry.values():
if type(env_spec.entry_point) is not str:
continue
+
# collect MO Gymnasium envs
if env_spec.entry_point.split(".")[0] == "mo_gymnasium":
all_testing_env_specs.append(env_spec)
@@ -27,7 +28,7 @@
def test_all_env_api(spec):
"""Check that all environments pass the environment checker."""
env = mo_gym.make(spec.id)
- env = mo_gym.LinearReward(env)
+ env = mo_gym.wrappers.LinearReward(env)
check_env(env, skip_render_check=True)
_test_reward_bounds(env.unwrapped)
_test_pickle_env(env)
@@ -46,7 +47,7 @@ def test_all_env_passive_env_checker(spec):
[
("MountainCar-v0", "mo-mountaincar-v0"),
("MountainCarContinuous-v0", "mo-mountaincarcontinuous-v0"),
- ("LunarLander-v2", "mo-lunar-lander-v2"),
+ ("LunarLander-v3", "mo-lunar-lander-v3"),
# ("Reacher-v4", "mo-reacher-v4"), # use a different model and action space
("Hopper-v4", "mo-hopper-v4"),
("HalfCheetah-v4", "mo-halfcheetah-v4"),
@@ -58,7 +59,7 @@ def test_all_env_passive_env_checker(spec):
)
def test_gymnasium_equivalence(gym_id, mo_gym_id, num_steps=100, seed=123):
env = gym.make(gym_id)
- mo_env = mo_gym.LinearReward(mo_gym.make(mo_gym_id))
+ mo_env = mo_gym.wrappers.LinearReward(mo_gym.make(mo_gym_id))
# for float rewards, then precision becomes an issue
env = gym.wrappers.TransformReward(env, lambda reward: round(reward, 4))
@@ -93,8 +94,8 @@ def test_env_determinism_rollout(env_spec: EnvSpec):
env_1 = mo_gym.make(env_spec.id)
env_2 = mo_gym.make(env_spec.id)
- env_1 = mo_gym.LinearReward(env_1)
- env_2 = mo_gym.LinearReward(env_2)
+ env_1 = mo_gym.wrappers.LinearReward(env_1)
+ env_2 = mo_gym.wrappers.LinearReward(env_2)
initial_obs_1, initial_info_1 = env_1.reset(seed=SEED)
initial_obs_2, initial_info_2 = env_2.reset(seed=SEED)
@@ -156,7 +157,7 @@ def assert_equals(a, b, prefix=None):
b: second data structure
prefix: prefix for failed assertion message for types and dicts
"""
- assert type(a) == type(b), f"{prefix}Differing types: {a} and {b}"
+ assert type(a) is type(b), f"{prefix}Differing types: {a} and {b}"
if isinstance(a, dict):
assert list(a.keys()) == list(b.keys()), f"{prefix}Key sets differ: {a} and {b}"
@@ -190,7 +191,7 @@ def test_ccs_dst():
np.array([19.778, -17.383]),
]
- discounted_front = env.pareto_front(gamma=0.99)
+ discounted_front = env.unwrapped.pareto_front(gamma=0.99)
for desired, actual in zip(known_ccs, discounted_front):
np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -200,7 +201,7 @@ def test_ccs_dst_no_discount():
known_ccs = mo_gym.envs.deep_sea_treasure.deep_sea_treasure.CONVEX_FRONT
- discounted_front = env.pareto_front(gamma=1.0)
+ discounted_front = env.unwrapped.pareto_front(gamma=1.0)
for desired, actual in zip(known_ccs, discounted_front):
np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -223,7 +224,7 @@ def test_concave_pf_dst():
np.array([124.0 * gamma**18, -17.383]),
]
- discounted_front = env.pareto_front(gamma=0.99)
+ discounted_front = env.unwrapped.pareto_front(gamma=0.99)
for desired, actual in zip(known_pf, discounted_front):
np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -233,7 +234,7 @@ def test_concave_pf_dst_no_discount():
known_pf = mo_gym.envs.deep_sea_treasure.deep_sea_treasure.CONCAVE_FRONT
- discounted_front = env.pareto_front(gamma=1.0)
+ discounted_front = env.unwrapped.pareto_front(gamma=1.0)
for desired, actual in zip(known_pf, discounted_front):
np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -244,7 +245,7 @@ def test_pf_fruit_tree():
known_pf = np.array(mo_gym.envs.fruit_tree.fruit_tree.FRUITS[str(depth)]) * (0.99 ** (depth - 1))
- discounted_front = env.pareto_front(gamma=0.99)
+ discounted_front = env.unwrapped.pareto_front(gamma=0.99)
for desired, actual in zip(known_pf, discounted_front):
np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -255,6 +256,6 @@ def test_pf_fruit_tree_no_discount():
known_pf = mo_gym.envs.fruit_tree.fruit_tree.FRUITS[str(depth)]
- discounted_front = env.pareto_front(gamma=1.0)
+ discounted_front = env.unwrapped.pareto_front(gamma=1.0)
for desired, actual in zip(known_pf, discounted_front):
np.testing.assert_array_almost_equal(desired, actual, decimal=2)
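# Illustrative note (not part of the patch): the `pareto_front` calls above now go
# through `env.unwrapped` because, starting with Gymnasium 1.0, wrappers no longer
# transparently forward custom attributes/methods of the base environment, so
# env-specific helpers must be reached explicitly (to my knowledge,
# `env.get_wrapper_attr("pareto_front")` is an equivalent alternative).
import mo_gymnasium as mo_gym

env = mo_gym.make("deep-sea-treasure-v0")       # `make` returns a wrapped env (e.g. TimeLimit)
front = env.unwrapped.pareto_front(gamma=0.99)  # explicit access to the base environment
print(len(front))                               # number of points on the discounted front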
diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py
new file mode 100644
index 00000000..d57d7567
--- /dev/null
+++ b/tests/test_vector_wrappers.py
@@ -0,0 +1,89 @@
+import gymnasium as gym
+import numpy as np
+
+import mo_gymnasium as mo_gym
+from mo_gymnasium.wrappers.vector import MORecordEpisodeStatistics, MOSyncVectorEnv
+
+
+def test_mo_sync_wrapper():
+ num_envs = 3
+ envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
+
+ envs.reset()
+ obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
+ assert len(obs) == num_envs, "Number of observations does not match the number of envs"
+ assert len(rewards) == num_envs, "Number of rewards does not match the number of envs"
+ assert len(terminateds) == num_envs, "Number of terminateds does not match the number of envs"
+ assert len(truncateds) == num_envs, "Number of truncateds does not match the number of envs"
+ envs.close()
+
+
+def test_mo_sync_autoreset():
+ num_envs = 2
+ envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
+
+ obs, infos = envs.reset()
+ assert (obs[0] == [0, 0]).all()
+ assert (obs[1] == [0, 0]).all()
+ obs, rewards, terminateds, truncateds, infos = envs.step([0, 1])
+ assert (obs[0] == [0, 0]).all()
+ assert (obs[1] == [1, 0]).all()
+ # Use np assert almost equal to avoid floating point errors
+ np.testing.assert_almost_equal(rewards[0], np.array([0.0, -1.0], dtype=np.float32), decimal=2)
+ np.testing.assert_almost_equal(rewards[1], np.array([0.7, -1.0], dtype=np.float32), decimal=2)
+ assert not terminateds[0]
+ assert terminateds[1] # This one is done
+ assert not truncateds[0]
+ assert not truncateds[1]
+ obs, rewards, terminateds, truncateds, infos = envs.step([0, 1])
+ assert (obs[0] == [0, 0]).all()
+ assert (obs[1] == [0, 0]).all()
+ assert (rewards[0] == [0.0, -1.0]).all()
+ assert (rewards[1] == [0.0, 0.0]).all() # Reset step
+ assert not terminateds[0]
+ assert not terminateds[1] # Not done anymore
+ envs.close()
+
+
+def test_mo_record_ep_statistic_vector_env():
+ num_envs = 2
+ envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
+ envs = MORecordEpisodeStatistics(envs, gamma=0.97)
+
+ envs.reset()
+ terminateds = np.array([False] * num_envs)
+ info = {}
+ obs, rewards, terminateds, _, info = envs.step([0, 3])
+ obs, rewards, terminateds, _, info = envs.step([0, 1])
+ obs, rewards, terminateds, _, info = envs.step([0, 1])
+
+ assert isinstance(info["episode"]["r"], np.ndarray)
+ assert isinstance(info["episode"]["dr"], np.ndarray)
+ # Episode records are vectorized because there are multiple environments
+ assert info["episode"]["r"].shape == (num_envs, 2)
+ np.testing.assert_almost_equal(info["episode"]["r"][0], np.array([0.0, 0.0], dtype=np.float32), decimal=2)
+ np.testing.assert_almost_equal(info["episode"]["r"][1], np.array([8.2, -3.0], dtype=np.float32), decimal=2)
+ assert info["episode"]["dr"].shape == (num_envs, 2)
+ np.testing.assert_almost_equal(info["episode"]["dr"][0], np.array([0.0, 0.0], dtype=np.float32), decimal=2)
+ np.testing.assert_almost_equal(info["episode"]["dr"][1], np.array([7.72, -2.91], dtype=np.float32), decimal=2)
+ assert isinstance(info["episode"]["l"], np.ndarray)
+ np.testing.assert_almost_equal(info["episode"]["l"], np.array([0, 3], dtype=np.float32), decimal=2)
+ assert isinstance(info["episode"]["t"], np.ndarray)
+ envs.close()
+
+
+def test_gym_wrapper_and_vector():
+ # This tests the integration of gym-wrapped envs with MO-Gymnasium vectorized envs
+ num_envs = 2
+ envs = MOSyncVectorEnv(
+ [lambda: gym.wrappers.NormalizeObservation(mo_gym.make("deep-sea-treasure-v0")) for _ in range(num_envs)]
+ )
+
+ envs.reset()
+ for i in range(30):
+ obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
+ assert len(obs) == num_envs, "Number of observations does not match the number of envs"
+ assert len(rewards) == num_envs, "Number of rewards does not match the number of envs"
+ assert len(terminateds) == num_envs, "Number of terminateds does not match the number of envs"
+ assert len(truncateds) == num_envs, "Number of truncateds does not match the number of envs"
+ envs.close()
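# Illustrative sketch (not part of the patch): test_mo_sync_autoreset above relies on
# Gymnasium 1.0's autoreset convention, where the step *after* a termination returns
# the reset observation together with an all-zero reward vector. A training loop
# typically masks that "reset step" transition, e.g. along these lines (env id, seed
# and loop length are arbitrary):
import numpy as np
import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers.vector import MOSyncVectorEnv

num_envs = 2
envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
obs, infos = envs.reset(seed=42)

just_reset = np.zeros(num_envs, dtype=bool)  # True on the reset step of each sub-env
for _ in range(50):
    actions = envs.action_space.sample()
    next_obs, rewards, terminateds, truncateds, infos = envs.step(actions)
    for i in range(num_envs):
        if not just_reset[i]:
            pass  # a real agent would store (obs[i], actions[i], rewards[i], next_obs[i]) here
    just_reset = np.logical_or(terminateds, truncateds)
    obs = next_obs
envs.close()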
diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py
index 9cf42354..df2ded4a 100644
--- a/tests/test_wrappers.py
+++ b/tests/test_wrappers.py
@@ -1,11 +1,10 @@
import numpy as np
import mo_gymnasium as mo_gym
-from mo_gymnasium import (
+from mo_gymnasium.wrappers import (
MOClipReward,
MONormalizeReward,
MORecordEpisodeStatistics,
- MOSyncVectorEnv,
)
@@ -14,35 +13,42 @@ def go_to_8_3(env):
Goes to (8.2, -3) treasure, returns the rewards
"""
env.reset()
- env.step(3) # right
- env.step(1) # down
- _, rewards, _, _, infos = env.step(1)
+ env.step(3) # action: right, rewards: [0, -1]
+ env.step(1) # action: down, rewards: [0, -1]
+ _, rewards, _, _, infos = env.step(1) # action: down, rewards: [8.2, -1]
return rewards, infos
def test_normalization_wrapper():
+ # Note that the wrapper does not normalize the rewards to have a mean of 0 and a std of 1;
+ # instead, it scales them so that an exponential moving average of the discounted returns has an approximately fixed variance
env = mo_gym.make("deep-sea-treasure-v0")
norm_treasure_env = MONormalizeReward(env, idx=0)
both_norm_env = MONormalizeReward(norm_treasure_env, idx=1)
+ # No normalization
+ env.reset(seed=0)
+ _, rewards, _, _, _ = env.step(1)
+ np.testing.assert_almost_equal(rewards, [0.7, -1.0], decimal=2)
+
# Tests for both rewards normalized
for i in range(30):
go_to_8_3(both_norm_env)
- both_norm_env.reset()
+ both_norm_env.reset(seed=0)
_, rewards, _, _, _ = both_norm_env.step(1) # down
- np.testing.assert_allclose(rewards, [0.18, -1.24], rtol=0, atol=1e-2)
+ np.testing.assert_almost_equal(rewards, [0.5, -1.24], decimal=2)
rewards, _ = go_to_8_3(both_norm_env)
- np.testing.assert_allclose(rewards, [2.13, -1.24], rtol=0, atol=1e-2)
+ np.testing.assert_almost_equal(rewards, [4.73, -1.24], decimal=2)
# Tests for only treasure normalized
for i in range(30):
go_to_8_3(norm_treasure_env)
- norm_treasure_env.reset()
+ norm_treasure_env.reset(seed=0)
_, rewards, _, _, _ = norm_treasure_env.step(1) # down
# Time rewards are not normalized (-1)
- np.testing.assert_allclose(rewards, [0.18, -1.0], rtol=0, atol=1e-2)
+ np.testing.assert_almost_equal(rewards, [0.51, -1.0], decimal=2)
rewards, _ = go_to_8_3(norm_treasure_env)
- np.testing.assert_allclose(rewards, [2.13, -1.0], rtol=0, atol=1e-2)
+ np.testing.assert_almost_equal(rewards, [5.33, -1.0], decimal=2)
def test_clip_wrapper():
@@ -66,26 +72,6 @@ def test_clip_wrapper():
np.testing.assert_allclose(rewards, [0.5, -1.0], rtol=0, atol=1e-2)
-def test_mo_sync_wrapper():
- def make_env(env_id):
- def thunk():
- env = mo_gym.make(env_id)
- env = MORecordEpisodeStatistics(env, gamma=0.97)
- return env
-
- return thunk
-
- num_envs = 3
- envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)])
-
- envs.reset()
- obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
- assert len(obs) == num_envs, "Number of observations do not match the number of envs"
- assert len(rewards) == num_envs, "Number of rewards do not match the number of envs"
- assert len(terminateds) == num_envs, "Number of terminateds do not match the number of envs"
- assert len(truncateds) == num_envs, "Number of truncateds do not match the number of envs"
-
-
def test_mo_record_ep_statistic():
env = mo_gym.make("deep-sea-treasure-v0")
env = MORecordEpisodeStatistics(env, gamma=0.97)
@@ -98,37 +84,9 @@ def test_mo_record_ep_statistic():
assert info["episode"]["r"].shape == (2,)
assert info["episode"]["dr"].shape == (2,)
assert tuple(info["episode"]["r"]) == (np.float32(8.2), np.float32(-3.0))
- assert tuple(np.round(info["episode"]["dr"], 2)) == (
- np.float32(7.48),
- np.float32(-2.82),
- )
- assert isinstance(info["episode"]["l"], np.int32)
+ np.testing.assert_allclose(info["episode"]["dr"], [7.71538, -2.9109], rtol=0, atol=1e-2)
+ # 0 * 0.97**0 + 0 * 0.97**1 + 8.2 * 0.97**2 == 7.71538
+ # -1 * 0.97**0 + -1 * 0.97**1 + -1 * 0.97**2 == -2.9109
+ assert isinstance(info["episode"]["l"], int)
assert info["episode"]["l"] == 3
- assert isinstance(info["episode"]["t"], np.float32)
-
-
-def test_mo_record_ep_statistic_vector_env():
- def make_env(env_id):
- def thunk():
- env = mo_gym.make(env_id)
- return env
-
- return thunk
-
- num_envs = 3
- envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)])
- envs = MORecordEpisodeStatistics(envs)
-
- envs.reset()
- terminateds = np.array([False] * num_envs)
- info = {}
- while not np.any(terminateds):
- obs, rewards, terminateds, _, info = envs.step(envs.action_space.sample())
-
- assert isinstance(info["episode"]["r"], np.ndarray)
- assert isinstance(info["episode"]["dr"], np.ndarray)
- # Episode records are vectorized because multiple environments
- assert info["episode"]["r"].shape == (num_envs, 2)
- assert info["episode"]["dr"].shape == (num_envs, 2)
- assert isinstance(info["episode"]["l"], np.ndarray)
- assert isinstance(info["episode"]["t"], np.ndarray)
+ assert isinstance(info["episode"]["t"], float)
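# Illustrative sketch (my reading, not the library's source): as noted in
# test_normalization_wrapper above, MONormalizeReward does not shift rewards to
# zero mean / unit std; like Gymnasium's NormalizeReward, it divides one reward
# component (`idx`) by the standard deviation of a running estimate of that
# component's discounted return. All names below are hypothetical.
import numpy as np


class RunningReturnScaler:
    """Rescales one component of a vector reward by the std of its discounted return."""

    def __init__(self, idx: int, gamma: float = 0.99, epsilon: float = 1e-8):
        self.idx, self.gamma, self.epsilon = idx, gamma, epsilon
        self.ret = 0.0                    # running discounted return of component `idx`
        self.mean, self.var, self.count = 0.0, 1.0, 1e-4

    def __call__(self, reward: np.ndarray, done: bool) -> np.ndarray:
        # Accumulate the discounted return, resetting it when the episode ends.
        self.ret = self.ret * self.gamma * (1.0 - float(done)) + float(reward[self.idx])
        # Welford-style running mean/variance update on the discounted return.
        self.count += 1
        delta = self.ret - self.mean
        self.mean += delta / self.count
        self.var += (delta * (self.ret - self.mean) - self.var) / self.count
        scaled = np.array(reward, dtype=np.float32)
        scaled[self.idx] = reward[self.idx] / np.sqrt(self.var + self.epsilon)
        return scaled


# Example: only the treasure reward (index 0) is rescaled; the time penalty is untouched.
scaler = RunningReturnScaler(idx=0)
print(scaler(np.array([8.2, -1.0], dtype=np.float32), done=True))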