diff --git a/.github/workflows/build-publish.yml b/.github/workflows/build-publish.yml index d44dc96d..a00c7e8e 100644 --- a/.github/workflows/build-publish.yml +++ b/.github/workflows/build-publish.yml @@ -4,7 +4,7 @@ # - https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ # # derived from https://github.com/Farama-Foundation/PettingZoo/blob/e230f4d80a5df3baf9bd905149f6d4e8ce22be31/.github/workflows/build-publish.yml -name: build-publish +name: Build artifact for PyPI on: push: @@ -16,35 +16,18 @@ on: jobs: build-wheels: - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: ubuntu-latest - python: 38 - platform: manylinux_x86_64 - - os: ubuntu-latest - python: 39 - platform: manylinux_x86_64 - - os: ubuntu-latest - python: 310 - platform: manylinux_x86_64 - - os: ubuntu-latest - python: 311 - platform: manylinux_x86_64 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - name: Install dependencies - run: python -m pip install --upgrade pip setuptools build + run: pipx install build - name: Build sdist and wheels - run: python -m build + run: pyproject-build - name: Store wheels - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: path: dist @@ -55,10 +38,11 @@ jobs: if: github.event_name == 'release' && github.event.action == 'published' steps: - name: Download dists - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: artifact path: dist + - name: Publish uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 80ce02af..9f2cc2ab 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -13,9 +13,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - - run: python -m pip install pre-commit - - run: python -m pre_commit --version - - run: python -m pre_commit install - - run: python -m pre_commit run --all-files + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - run: pipx install pre-commit + - run: pre-commit run --all-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cbbea960..05e72fd0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v5.0.0 hooks: - id: check-symlinks - id: destroyed-symlinks @@ -17,13 +17,13 @@ repos: - id: detect-private-key - id: debug-statements - repo: https://github.com/codespell-project/codespell - rev: v2.2.4 + rev: v2.3.0 hooks: - id: codespell args: - --ignore-words-list=reacher, mor - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 args: @@ -34,16 +34,16 @@ repos: - --show-source - --statistics - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 + rev: v3.18.0 hooks: - id: pyupgrade args: ["--py37-plus"] - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/python/black - rev: 23.1.0 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/pycqa/pydocstyle diff --git a/README.md b/README.md index fb5f7885..708bb3a6 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 
@@ obs, info = env.reset() next_obs, vector_reward, terminated, truncated, info = env.step(your_agent.act(obs)) # Optionally, you can scalarize the reward function with the LinearReward wrapper -env = mo_gym.LinearReward(env, weight=np.array([0.8, 0.2, 0.2])) +env = mo_gym.wrappers.LinearReward(env, weight=np.array([0.8, 0.2, 0.2])) ``` For details on multi-objective MDP's (MOMDP's) and other MORL definitions, see [A practical guide to multi-objective reinforcement learning and planning](https://link.springer.com/article/10.1007/s10458-022-09552-y). diff --git a/docs/_scripts/gen_env_docs.py b/docs/_scripts/gen_env_docs.py index eec55219..87a2184e 100644 --- a/docs/_scripts/gen_env_docs.py +++ b/docs/_scripts/gen_env_docs.py @@ -41,7 +41,7 @@ def trim(docstring): pattern = re.compile(r"(? | Continuous / Discrete | `[time_penalty, reverse_penalty, forward_penalty]` | Classic Mountain Car env, but with extra penalties for the forward and reverse actions. From [Vamplew et al. 2011](https://www.researchgate.net/publication/220343783_Empirical_evaluation_methods_for_multiobjective_reinforcement_learning_algorithms). | -| [`mo-mountaincarcontinuous-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincarcontinuous/)
| Continuous / Continuous | `[time_penalty, fuel_consumption_penalty]` | Continuous Mountain Car env, but with penalties for fuel consumption. | +| [`mo-mountaincar-3d-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) **
| Continuous / Discrete | `[time_penalty, move_penalty, speed_objective]` | The forward and backward penalties are merged into a single move penalty, and a speed objective is added that gives a positive reward equal to the car's speed at that time step.* | +| [`mo-mountaincar-timemove-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) **
| Continuous / Discrete | `[time_penalty, move_penalty]` | Classic Mountain Car env, but with the backward and forward penalties merged into a single move penalty. | +| [`mo-mountaincar-timespeed-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) **
| Continuous / Discrete | `[time_penalty, speed_objective]` | Classic Mountain Car env, but with an extra speed objective that gives a positive reward equal to the car's speed at that time step.* | +| [`mo-mountaincarcontinuous-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincarcontinuous/)
| Continuous / Continuous | `[time_penalty, fuel_consumption_penalty]` | Continuous Mountain Car env, but with penalties for fuel consumption. | | [`mo-lunar-lander-v2`](https://mo-gymnasium.farama.org/environments/mo-lunar-lander/)
| Continuous / Discrete or Continuous | `[landed, shaped_reward, main_engine_fuel, side_engine_fuel]` | MO version of the `LunarLander-v2` [environment](https://gymnasium.farama.org/environments/box2d/lunar_lander/). Objectives defined similarly as in [Hung et al. 2022](https://openreview.net/forum?id=AwWaBXLIJE). | +*An additional objective was introduced to prevent the agent from converging to the local maxima due to a lack of reward signal for the static action. + +**Read more about these environments and the detailed reasoning behind them in [`Pranav Gupta's Dissertation`](https://drive.google.com/file/d/1yT6hlavYZGmoB2phaIBX_5hbibA3Illa/view?usp=sharing) + + ```{toctree} :hidden: :glob: diff --git a/docs/index.md b/docs/index.md index fb6d56ff..f1d24905 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,6 +11,7 @@ lastpage: introduction/install introduction/api wrappers/wrappers +wrappers/vector_wrappers examples/morl_baselines ``` diff --git a/docs/wrappers/vector_wrappers.md b/docs/wrappers/vector_wrappers.md new file mode 100644 index 00000000..ade24022 --- /dev/null +++ b/docs/wrappers/vector_wrappers.md @@ -0,0 +1,20 @@ +--- +title: "Vector Wrappers" +--- + +# Vector Wrappers + +Similar to the normal wrappers, MO-Gymnasium provides a few wrappers that are specifically designed to work with vectorized environments. They are all available directly from the `mo_gymnasium.wrappers.vector` module. + + +## `MOSyncVectorEnv` + +```{eval-rst} +.. autoclass:: mo_gymnasium.wrappers.vector.MOSyncVectorEnv +``` + +## `MORecordEpisodeStatistics` + +```{eval-rst} +.. autoclass:: mo_gymnasium.wrappers.vector.MORecordEpisodeStatistics +``` diff --git a/docs/wrappers/wrappers.md b/docs/wrappers/wrappers.md index 542e5cca..acf2ab56 100644 --- a/docs/wrappers/wrappers.md +++ b/docs/wrappers/wrappers.md @@ -4,36 +4,36 @@ title: "Wrappers" # Wrappers -A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium` module. +A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium.wrappers` module. ## `LinearReward` ```{eval-rst} -.. autoclass:: mo_gymnasium.LinearReward +.. autoclass:: mo_gymnasium.wrappers.LinearReward ``` ## `MONormalizeReward` ```{eval-rst} -.. autoclass:: mo_gymnasium.MONormalizeReward +.. autoclass:: mo_gymnasium.wrappers.MONormalizeReward ``` ## `MOClipReward` ```{eval-rst} -.. autoclass:: mo_gymnasium.MOClipReward +.. autoclass:: mo_gymnasium.wrappers.MOClipReward ``` -## `MOSyncVectorEnv` +## `MORecordEpisodeStatistics` ```{eval-rst} -.. autoclass:: mo_gymnasium.MOSyncVectorEnv +.. autoclass:: mo_gymnasium.wrappers.MORecordEpisodeStatistics ``` -## `MORecordEpisodeStatistics` +## `MOMaxAndSkipObservation` ```{eval-rst} -.. autoclass:: mo_gymnasium.MORecordEpisodeStatistics +.. 
autoclass:: mo_gymnasium.wrappers.MOMaxAndSkipObservation ``` diff --git a/mo_gymnasium/__init__.py b/mo_gymnasium/__init__.py index 23201d0c..94fe1c92 100644 --- a/mo_gymnasium/__init__.py +++ b/mo_gymnasium/__init__.py @@ -2,16 +2,10 @@ # Envs import mo_gymnasium.envs +from mo_gymnasium import wrappers # Utils -from mo_gymnasium.utils import ( - LinearReward, - MOClipReward, - MONormalizeReward, - MORecordEpisodeStatistics, - MOSyncVectorEnv, - make, -) +from mo_gymnasium.utils import make -__version__ = "1.1.0" +__version__ = "1.2.0" diff --git a/mo_gymnasium/envs/__init__.py b/mo_gymnasium/envs/__init__.py index 7e917397..c4846df6 100644 --- a/mo_gymnasium/envs/__init__.py +++ b/mo_gymnasium/envs/__init__.py @@ -10,6 +10,5 @@ import mo_gymnasium.envs.minecart import mo_gymnasium.envs.mountain_car import mo_gymnasium.envs.mujoco -import mo_gymnasium.envs.reacher import mo_gymnasium.envs.resource_gathering import mo_gymnasium.envs.water_reservoir diff --git a/mo_gymnasium/envs/fishwood/fishwood.py b/mo_gymnasium/envs/fishwood/fishwood.py index fad02d77..7aa1242b 100644 --- a/mo_gymnasium/envs/fishwood/fishwood.py +++ b/mo_gymnasium/envs/fishwood/fishwood.py @@ -42,8 +42,8 @@ class FishWood(gym.Env, EzPickle): """ metadata = {"render_modes": ["human"]} - FISH = 0 - WOOD = 1 + FISH = np.array([0], dtype=np.int32) + WOOD = np.array([1], dtype=np.int32) MAX_TS = 200 def __init__(self, render_mode: Optional[str] = None, fishproba=0.1, woodproba=0.9): @@ -55,17 +55,17 @@ def __init__(self, render_mode: Optional[str] = None, fishproba=0.1, woodproba=0 self.action_space = spaces.Discrete(2) # 2 actions, go fish and go wood # 2 states, fishing and in the woods - self.observation_space = spaces.Discrete(2) + self.observation_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.int32) # 2 objectives, amount of fish and amount of wood self.reward_space = spaces.Box(low=np.array([0, 0]), high=np.array([1.0, 1.0]), dtype=np.float32) self.reward_dim = 2 - self._state = self.WOOD + self._state = self.WOOD.copy() def reset(self, seed=None, **kwargs): super().reset(seed=seed) - self._state = self.WOOD + self._state = self.WOOD.copy() self._timestep = 0 if self.render_mode == "human": self.render() @@ -89,7 +89,7 @@ def step(self, action): rewards[self.FISH] = 1.0 # Execute the action - self._state = action + self._state = np.array([action], dtype=np.int32) self._timestep += 1 if self.render_mode == "human": diff --git a/mo_gymnasium/envs/lunar_lander/__init__.py b/mo_gymnasium/envs/lunar_lander/__init__.py index d4435341..817671fb 100644 --- a/mo_gymnasium/envs/lunar_lander/__init__.py +++ b/mo_gymnasium/envs/lunar_lander/__init__.py @@ -2,13 +2,13 @@ register( - id="mo-lunar-lander-v2", + id="mo-lunar-lander-v3", entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander", max_episode_steps=1000, ) register( - id="mo-lunar-lander-continuous-v2", + id="mo-lunar-lander-continuous-v3", entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander", max_episode_steps=1000, kwargs={"continuous": True}, diff --git a/mo_gymnasium/envs/mario/joypad_space.py b/mo_gymnasium/envs/mario/joypad_space.py index 73969eee..32fb0dfb 100644 --- a/mo_gymnasium/envs/mario/joypad_space.py +++ b/mo_gymnasium/envs/mario/joypad_space.py @@ -1,4 +1,5 @@ """An environment wrapper to convert binary to discrete action space. 
This is a modified version of the original code from nes-py.""" + from typing import List import gymnasium as gym diff --git a/mo_gymnasium/envs/mario/mario.py b/mo_gymnasium/envs/mario/mario.py index b7279941..45924ed3 100644 --- a/mo_gymnasium/envs/mario/mario.py +++ b/mo_gymnasium/envs/mario/mario.py @@ -7,7 +7,6 @@ from gymnasium.utils import EzPickle, seeding # from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv -from gymnasium.wrappers import GrayScaleObservation, ResizeObservation from nes_py.nes_env import SCREEN_SHAPE_24_BIT import mo_gymnasium as mo_gym @@ -16,7 +15,7 @@ from mo_gymnasium.envs.mario.joypad_space import JoypadSpace -class MOSuperMarioBros(SuperMarioBrosEnv, EzPickle): +class MOSuperMarioBros(SuperMarioBrosEnv, gym.Env, EzPickle): """ ## Description Multi-objective version of the SuperMarioBro environment. @@ -202,11 +201,14 @@ def step(self, action): if __name__ == "__main__": + from gymnasium.wrappers import ResizeObservation + from gymnasium.wrappers.transform_observation import GrayscaleObservation + env = MOSuperMarioBros() env = JoypadSpace(env, SIMPLE_MOVEMENT) # env = MaxAndSkipEnv(env, 4) env = ResizeObservation(env, (84, 84)) - env = GrayScaleObservation(env) + env = GrayscaleObservation(env) # env = FrameStack(env, 4) env = mo_gym.LinearReward(env) diff --git a/mo_gymnasium/envs/minecart/minecart.py b/mo_gymnasium/envs/minecart/minecart.py index b5154192..cd6c64e7 100644 --- a/mo_gymnasium/envs/minecart/minecart.py +++ b/mo_gymnasium/envs/minecart/minecart.py @@ -249,9 +249,11 @@ def pareto_front(self, gamma: float, symmetric: bool = True) -> List[np.ndarray] queue = [ { "speed": ACCELERATION * self.frame_skip, - "dist": mine_distance - self.frame_skip * (self.frame_skip + 1) / 2 * ACCELERATION - if self.incremental_frame_skip - else mine_distance - ACCELERATION * self.frame_skip * self.frame_skip, + "dist": ( + mine_distance - self.frame_skip * (self.frame_skip + 1) / 2 * ACCELERATION + if self.incremental_frame_skip + else mine_distance - ACCELERATION * self.frame_skip * self.frame_skip + ), "seq": [ACT_ACCEL], } ] diff --git a/mo_gymnasium/envs/mountain_car/__init__.py b/mo_gymnasium/envs/mountain_car/__init__.py index f75fe751..523de281 100644 --- a/mo_gymnasium/envs/mountain_car/__init__.py +++ b/mo_gymnasium/envs/mountain_car/__init__.py @@ -6,3 +6,24 @@ entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar", max_episode_steps=200, ) + +register( + id="mo-mountaincar-3d-v0", + entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar", + max_episode_steps=200, + kwargs={"add_speed_objective": True, "merge_move_penalty": True}, +) + +register( + id="mo-mountaincar-timemove-v0", + entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar", + max_episode_steps=200, + kwargs={"merge_move_penalty": True}, +) + +register( + id="mo-mountaincar-timespeed-v0", + entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar", + max_episode_steps=200, + kwargs={"remove_move_penalty": True, "add_speed_objective": True}, +) diff --git a/mo_gymnasium/envs/mountain_car/mountain_car.py b/mo_gymnasium/envs/mountain_car/mountain_car.py index 6e88acca..f49cf5ce 100644 --- a/mo_gymnasium/envs/mountain_car/mountain_car.py +++ b/mo_gymnasium/envs/mountain_car/mountain_car.py @@ -14,19 +14,50 @@ class MOMountainCar(MountainCarEnv, EzPickle): See [Gymnasium's env](https://gymnasium.farama.org/environments/classic_control/mountain_car_continuous/) for more information. 
## Reward space: - The reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward. + By default, the reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward. - time penalty: -1.0 for each time step - reverse penalty: -1.0 for each time step the action is 0 (reverse) - forward penalty: -1.0 for each time step the action is 2 (forward) + + #Alternatively, the reward can be changed with the following options: + - add_speed_objective: Add an extra objective corresponding to the speed of the car. + - remove_move_penalty: Remove the reverse and forward objectives. + - merge_move_penalty: Merge reverse and forward penalties into a single penalty. """ - def __init__(self, render_mode: Optional[str] = None, goal_velocity=0): + def __init__( + self, + render_mode: Optional[str] = None, + add_speed_objective: bool = False, + remove_move_penalty: bool = False, + merge_move_penalty: bool = False, + goal_velocity=0, + ): super().__init__(render_mode, goal_velocity) - EzPickle.__init__(self, render_mode, goal_velocity) + EzPickle.__init__(self, render_mode, add_speed_objective, remove_move_penalty, merge_move_penalty, goal_velocity) + self.add_speed_objective = add_speed_objective + self.remove_move_penalty = remove_move_penalty + self.merge_move_penalty = merge_move_penalty - self.reward_space = spaces.Box(low=np.array([-1, -1, -1]), high=np.array([-1, 0, 0]), shape=(3,), dtype=np.float32) self.reward_dim = 3 + if self.add_speed_objective: + self.reward_dim += 1 + + if self.remove_move_penalty: + self.reward_dim -= 2 + elif self.merge_move_penalty: + self.reward_dim -= 1 + + low = np.array([-1] * self.reward_dim) + high = np.zeros(self.reward_dim) + high[0] = -1 # Time penalty is always -1 + if self.add_speed_objective: + low[-1] = 0.0 + high[-1] = 1.1 + + self.reward_space = spaces.Box(low=low, high=high, shape=(self.reward_dim,), dtype=np.float32) + def step(self, action: int): assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid" @@ -39,11 +70,20 @@ def step(self, action: int): velocity = 0 terminated = bool(position >= self.goal_position and velocity >= self.goal_velocity) - # reward = -1.0 - reward = np.zeros(3, dtype=np.float32) + + reward = np.zeros(self.reward_dim, dtype=np.float32) + reward[0] = 0.0 if terminated else -1.0 # time penalty - reward[1] = 0.0 if action != 0 else -1.0 # reverse penalty - reward[2] = 0.0 if action != 2 else -1.0 # forward penalty + + if not self.remove_move_penalty: + if self.merge_move_penalty: + reward[1] = 0.0 if action == 1 else -1.0 + else: + reward[1] = 0.0 if action != 0 else -1.0 # reverse penalty + reward[2] = 0.0 if action != 2 else -1.0 # forward penalty + + if self.add_speed_objective: + reward[-1] = 15 * abs(velocity) self.state = (position, velocity) if self.render_mode == "human": diff --git a/mo_gymnasium/envs/mujoco/reacher_v4.py b/mo_gymnasium/envs/mujoco/reacher_v4.py index 01a5bc9d..9596f64c 100644 --- a/mo_gymnasium/envs/mujoco/reacher_v4.py +++ b/mo_gymnasium/envs/mujoco/reacher_v4.py @@ -13,7 +13,7 @@ class MOReacherEnv(ReacherEnv): """ ## Description - Mujoco version of `mo-reacher-v0`, based on [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/). + Multi-objective version of the [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/). 
## Observation Space The observation is 6-dimensional and contains: diff --git a/mo_gymnasium/envs/reacher/__init__.py b/mo_gymnasium/envs/reacher/__init__.py deleted file mode 100644 index b752382c..00000000 --- a/mo_gymnasium/envs/reacher/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from gymnasium.envs.registration import register - - -register( - id="mo-reacher-v0", - entry_point="mo_gymnasium.envs.reacher.reacher:ReacherBulletEnv", - max_episode_steps=100, - kwargs={"fixed_initial_state": None}, -) diff --git a/mo_gymnasium/envs/reacher/reacher.py b/mo_gymnasium/envs/reacher/reacher.py deleted file mode 100644 index f881f512..00000000 --- a/mo_gymnasium/envs/reacher/reacher.py +++ /dev/null @@ -1,158 +0,0 @@ -from typing import Optional - -import numpy as np -from gymnasium import spaces -from gymnasium.utils import EzPickle, seeding -from pybulletgym.envs.roboschool.envs.env_bases import BaseBulletEnv -from pybulletgym.envs.roboschool.robots.robot_bases import MJCFBasedRobot -from pybulletgym.envs.roboschool.scenes.scene_bases import SingleRobotEmptyScene - - -target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)])) - - -class ReacherBulletEnv(BaseBulletEnv, EzPickle): - metadata = {"render_modes": ["human", "rgb_array"]} - - def __init__( - self, - render_mode: Optional[str] = None, - target=(0.14, 0.0), - fixed_initial_state: Optional[tuple] = (3.14, 0), - ): - EzPickle.__init__(self, render_mode, target, fixed_initial_state) - self.robot = ReacherRobot(target, fixed_initial_state=fixed_initial_state) - self.render_mode = render_mode - BaseBulletEnv.__init__(self, self.robot, render=render_mode == "human") - self._cam_dist = 0.75 - - # self.target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14), (0.22, 0.0), (-0.22, 0.0), (0.0, 0.22), (0.0, -0.22), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)])) - # self.target_positions = list(map(lambda l: np.array(l), [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14), (0.1, 0.1), (0.1, -0.1), (-0.1, 0.1), (-0.1, -0.1)])) - self.target_positions = list( - map( - lambda l: np.array(l), - [(0.14, 0.0), (-0.14, 0.0), (0.0, 0.14), (0.0, -0.14)], - ) - ) - - actions = [-1.0, 0.0, 1.0] - self.action_dict = dict() - for a1 in actions: - for a2 in actions: - self.action_dict[len(self.action_dict)] = (a1, a2) - - self.action_space = spaces.Discrete(9) - self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32) - self.reward_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32) - self.reward_dim = 4 - - def create_single_player_scene(self, bullet_client): - return SingleRobotEmptyScene(bullet_client, gravity=0.0, timestep=0.0165, frame_skip=1) - - def step(self, a): - real_action = self.action_dict[int(a)] - - assert not self.scene.multiplayer - self.robot.apply_action(real_action) - self.scene.global_step() - - state = self.robot.calc_state() # sets self.to_target_vec - - """ delta = np.linalg.norm(np.array(self.robot.fingertip.pose().xyz()) - np.array(self.robot.target.pose().xyz())) - reward = 1. - 4. 
* delta """ - - phi = np.zeros(len(self.target_positions), dtype=np.float32) - for index, target in enumerate(self.target_positions): - delta = np.linalg.norm(np.array(self.robot.fingertip.pose().xyz()[:2]) - target) - phi[index] = 1.0 - 4 * delta # 1 - 4 - - self.HUD(state, real_action, False) - - if self.render_mode == "human": - self._render(mode="human") - - return state, phi, False, False, {} - - def render(self): - if self.render_mode == "human": - self._render(mode="human") - else: - return self._render(mode="rgb_array") - - def camera_adjust(self): - x, y, z = self.robot.fingertip.pose().xyz() - x *= 0.5 - y *= 0.5 - self.camera.move_and_look_at(0.3, 0.3, 0.3, x, y, z) - - def reset(self, seed=None, **kwargs): - self._seed(seed) - if seed is not None: - self._np_random, seed = seeding.np_random(seed) - obs = super().reset() - if self.render_mode == "human": - self._render(mode="human") - return obs, {} - - -class ReacherRobot(MJCFBasedRobot): - TARG_LIMIT = 0.27 - - def __init__(self, target, fixed_initial_state=False): - MJCFBasedRobot.__init__(self, "reacher.xml", "body0", action_dim=2, obs_dim=4) - self.target_pos = target - self.fixed_initial_state = fixed_initial_state - - def robot_specific_reset(self, bullet_client): - self.jdict["target_x"].reset_current_position(target_positions[0][0], 0) - self.jdict["target_y"].reset_current_position(target_positions[0][1], 0) - - """ self.jdict["target2_x"].reset_current_position(target_positions[1][0], 0) - self.jdict["target2_y"].reset_current_position(target_positions[1][1], 0) - self.jdict["target3_x"].reset_current_position(target_positions[2][0], 0) - self.jdict["target3_y"].reset_current_position(target_positions[2][1], 0) - self.jdict["target4_x"].reset_current_position(target_positions[3][0], 0) - self.jdict["target4_y"].reset_current_position(target_positions[3][1], 0) """ - - self.fingertip = self.parts["fingertip"] - self.target = self.parts["target"] - self.central_joint = self.jdict["joint0"] - self.elbow_joint = self.jdict["joint1"] - if self.fixed_initial_state is None: - self.central_joint.reset_current_position(self.np_random.uniform(low=-3.14, high=3.14), 0) - self.elbow_joint.reset_current_position(self.np_random.uniform(low=-3.14 / 2, high=3.14 / 2), 0) - else: - self.central_joint.reset_current_position(0, 0) - self.elbow_joint.reset_current_position(self.fixed_initial_state[0], self.fixed_initial_state[1]) - - def apply_action(self, a): - assert np.isfinite(a).all() - self.central_joint.set_motor_torque(0.05 * float(np.clip(a[0], -1, +1))) - self.elbow_joint.set_motor_torque(0.05 * float(np.clip(a[1], -1, +1))) - - def calc_state(self): - theta, self.theta_dot = self.central_joint.current_relative_position() - self.gamma, self.gamma_dot = self.elbow_joint.current_relative_position() - # target_x, _ = self.jdict["target_x"].current_position() - # target_y, _ = self.jdict["target_y"].current_position() - self.to_target_vec = np.array(self.fingertip.pose().xyz()) - np.array(self.target.pose().xyz()) - return np.array( - [ - np.cos(theta), - np.sin(theta), - self.theta_dot * 0.1, - self.gamma, - self.gamma_dot * 0.1, - ], - dtype=np.float32, - ) - - -if __name__ == "__main__": - env = ReacherBulletEnv() - # env.render(mode='human') - obs = env.reset() - print(env.observation_space.contains(obs), obs.dtype, env.observation_space) - while True: - env.step(env.action_space.sample()) - # env.render(mode='human') diff --git a/mo_gymnasium/py.typed b/mo_gymnasium/py.typed new file mode 100644 index 00000000..e69de29b diff 
--git a/mo_gymnasium/utils.py b/mo_gymnasium/utils.py index def90471..a5c41a14 100644 --- a/mo_gymnasium/utils.py +++ b/mo_gymnasium/utils.py @@ -1,14 +1,8 @@ -"""Utilities function such as wrappers.""" +"""Utilities functions.""" -import time -from copy import deepcopy -from typing import Iterator, Tuple, TypeVar +from typing import TypeVar import gymnasium as gym -import numpy as np -from gymnasium.vector import SyncVectorEnv -from gymnasium.wrappers.normalize import RunningMeanStd -from gymnasium.wrappers.record_episode_statistics import RecordEpisodeStatistics ObsType = TypeVar("ObsType") @@ -26,337 +20,3 @@ def make(env_name: str, disable_env_checker: bool = True, **kwargs) -> gym.Env: """ """Disable env checker, as it requires the reward to be a scalar.""" return gym.make(env_name, disable_env_checker=disable_env_checker, **kwargs) - - -class LinearReward(gym.Wrapper, gym.utils.RecordConstructorArgs): - """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector.""" - - def __init__(self, env: gym.Env, weight: np.ndarray = None): - """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector. - - Args: - env: env to wrap - weight: weight vector to use in the dot product - """ - gym.utils.RecordConstructorArgs.__init__(self, weight=weight) - gym.Wrapper.__init__(self, env) - if weight is None: - weight = np.ones(shape=env.unwrapped.reward_space.shape) - self.set_weight(weight) - - def set_weight(self, weight: np.ndarray): - """Changes weights for the scalarization. - - Args: - weight: new weights to set - Returns: nothing - """ - assert weight.shape == self.env.unwrapped.reward_space.shape, "Reward weight has different shape than reward vector." - self.w = weight - - def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]: - """Steps in the environment. - - Args: - action: action to perform - Returns: obs, scalarized_reward, terminated, truncated, info - """ - observation, reward, terminated, truncated, info = self.env.step(action) - scalar_reward = np.dot(reward, self.w) - info["vector_reward"] = reward - info["reward_weights"] = self.w - - return observation, scalar_reward, terminated, truncated, info - - -class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs): - """Wrapper to normalize the reward component at index idx. Does not touch other reward components.""" - - def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8): - """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance. - - Args: - env (env): The environment to apply the wrapper - idx (int): the index of the reward to normalize - epsilon (float): A stability parameter - gamma (float): The discount factor that is used in the exponential moving average. - """ - gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon) - gym.Wrapper.__init__(self, env) - self.idx = idx - self.num_envs = getattr(env, "num_envs", 1) - self.is_vector_env = getattr(env, "is_vector_env", False) - self.return_rms = RunningMeanStd(shape=()) - self.returns = np.zeros(self.num_envs) - self.gamma = gamma - self.epsilon = epsilon - - def step(self, action: ActType): - """Steps through the environment, normalizing the rewards returned. 
- - Args: - action: action to perform - Returns: obs, normalized_rewards, terminated, truncated, infos - """ - obs, rews, terminated, truncated, infos = self.env.step(action) - # Extracts the objective value to normalize - to_normalize = rews[self.idx] - if not self.is_vector_env: - to_normalize = np.array([to_normalize]) - self.returns = self.returns * self.gamma + to_normalize - # Defer normalization to gym implementation - to_normalize = self.normalize(to_normalize) - self.returns[terminated] = 0.0 - if not self.is_vector_env: - to_normalize = to_normalize[0] - # Injecting the normalized objective value back into the reward vector - rews[self.idx] = to_normalize - return obs, rews, terminated, truncated, infos - - def normalize(self, rews): - """Normalizes the rewards with the running mean rewards and their variance. - - Args: - rews: rewards - Returns: the normalized reward - """ - self.return_rms.update(self.returns) - return rews / np.sqrt(self.return_rms.var + self.epsilon) - - -class MOClipReward(gym.RewardWrapper, gym.utils.RecordConstructorArgs): - """Clip reward[idx] to [min, max].""" - - def __init__(self, env: gym.Env, idx: int, min_r, max_r): - """Clip reward[idx] to [min, max]. - - Args: - env: environment to wrap - idx: index of the MO reward to clip - min_r: min reward - max_r: max reward - """ - gym.utils.RecordConstructorArgs.__init__(self, idx=idx, min_r=min_r, max_r=max_r) - gym.RewardWrapper.__init__(self, env) - self.idx = idx - self.min_r = min_r - self.max_r = max_r - - def reward(self, reward): - """Clips the reward at the given index. - - Args: - reward: reward to clip. - Returns: the clipped reward. - """ - reward[self.idx] = np.clip(reward[self.idx], self.min_r, self.max_r) - return reward - - -class MOSyncVectorEnv(SyncVectorEnv): - """Vectorized environment that serially runs multiple environments.""" - - def __init__( - self, - env_fns: Iterator[callable], - copy: bool = True, - ): - """Vectorized environment that serially runs multiple environments. - - Args: - env_fns: env constructors - copy: If ``True``, then the :meth:`reset` and :meth:`step` methods return a copy of the observations. - """ - SyncVectorEnv.__init__(self, env_fns, copy=copy) - # Just overrides the rewards memory to add the number of objectives - self.reward_space = self.envs[0].unwrapped.reward_space - self._rewards = np.zeros( - ( - self.num_envs, - self.reward_space.shape[0], - ), - dtype=np.float64, - ) - - -class MORecordEpisodeStatistics(RecordEpisodeStatistics, gym.utils.RecordConstructorArgs): - """This wrapper will keep track of cumulative rewards and episode lengths. - - After the completion of an episode, ``info`` will look like this:: - - >>> info = { - ... "episode": { - ... "r": "", - ... "dr": "", - ... "l": "", # contrary to Gymnasium, these are not a numpy array - ... "t": "" - ... }, - ... } - - For a vectorized environments the output will be in the form of (be careful to first wrap the env into vector before applying MORewordStatistics):: - - >>> infos = { - ... "final_observation": "", - ... "_final_observation": "", - ... "final_info": "", - ... "_final_info": "", - ... "episode": { - ... "r": "", - ... "dr": "", - ... "l": "", - ... "t": "" - ... }, - ... "_episode": "" - ... } - """ - - def __init__(self, env: gym.Env, gamma: float = 1.0, deque_size: int = 100): - """This wrapper will keep track of cumulative rewards and episode lengths. 
- - Args: - env (Env): The environment to apply the wrapper - gamma (float): Discounting factor - deque_size: The size of the buffers :attr:`return_queue` and :attr:`length_queue` - """ - gym.utils.RecordConstructorArgs.__init__(self, gamma=gamma, deque_size=deque_size) - RecordEpisodeStatistics.__init__(self, env, deque_size=deque_size) - # CHANGE: Here we just override the standard implementation to extend to MO - # We also take care of the case where the env is vectorized - self.reward_dim = self.env.unwrapped.reward_space.shape[0] - if self.is_vector_env: - self.rewards_shape = (self.num_envs, self.reward_dim) - else: - self.rewards_shape = (self.reward_dim,) - self.gamma = gamma - - def reset(self, **kwargs): - """Resets the environment using kwargs and resets the episode returns and lengths.""" - obs, info = super().reset(**kwargs) - - # CHANGE: Here we just override the standard implementation to extend to MO - self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) - self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) - - return obs, info - - def step(self, action): - """Steps through the environment, recording the episode statistics.""" - # This is very close the code from the RecordEpisodeStatistics wrapper from gym. - ( - observations, - rewards, - terminations, - truncations, - infos, - ) = self.env.step(action) - assert isinstance( - infos, dict - ), f"`info` dtype is {type(infos)} while supported dtype is `dict`. This may be due to usage of other wrappers in the wrong order." - self.episode_returns += rewards - self.episode_lengths += 1 - - # CHANGE: The discounted returns are also computed here - self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( - self.episode_returns.shape - ) - - dones = np.logical_or(terminations, truncations) - num_dones = np.sum(dones) - if num_dones: - if "episode" in infos or "_episode" in infos: - raise ValueError("Attempted to add episode stats when they already exist") - else: - episode_return = np.zeros(self.rewards_shape, dtype=np.float32) - disc_episode_return = np.zeros(self.rewards_shape, dtype=np.float32) - if self.is_vector_env: - for i in range(self.num_envs): - if dones[i]: - # CHANGE: Makes a deepcopy to avoid subsequent mutations - episode_return[i] = deepcopy(self.episode_returns[i]) - disc_episode_return[i] = deepcopy(self.disc_episode_returns[i]) - else: - episode_return = deepcopy(self.episode_returns) - disc_episode_return = deepcopy(self.disc_episode_returns) - - length_eps = np.where(dones, self.episode_lengths, 0) - time_eps = np.where( - dones, - np.round(time.perf_counter() - self.episode_start_times, 6), - 0.0, - ) - - infos["episode"] = { - "r": episode_return, - "dr": disc_episode_return, - "l": length_eps[0] if not self.is_vector_env else length_eps, - "t": time_eps[0] if not self.is_vector_env else time_eps, - } - if self.is_vector_env: - infos["_episode"] = np.where(dones, True, False) - self.return_queue.extend(self.episode_returns[dones]) - self.length_queue.extend(self.episode_lengths[dones]) - self.episode_count += num_dones - self.episode_lengths[dones] = 0 - self.episode_returns[dones] = np.zeros(self.reward_dim, dtype=np.float32) - self.disc_episode_returns[dones] = np.zeros(self.reward_dim, dtype=np.float32) - self.episode_start_times[dones] = time.perf_counter() - return ( - observations, - rewards, - terminations, - truncations, - infos, - ) - - -class MOMaxAndSkipObservation(gym.Wrapper): - """This wrapper will 
return only every ``skip``-th frame (frameskipping) and return the max between the two last observations. - - Note: This wrapper is based on the wrapper from stable-baselines3: https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html#MaxAndSkipEnv - """ - - def __init__(self, env: gym.Env[ObsType, ActType], skip: int = 4): - """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last frames. - - Args: - env (Env): The environment to apply the wrapper - skip: The number of frames to skip - """ - gym.Wrapper.__init__(self, env) - - if not np.issubdtype(type(skip), np.integer): - raise TypeError(f"The skip is expected to be an integer, actual type: {type(skip)}") - if skip < 2: - raise ValueError(f"The skip value needs to be equal or greater than two, actual value: {skip}") - if env.observation_space.shape is None: - raise ValueError("The observation space must have the shape attribute.") - - self._skip = skip - self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype) - - def step(self, action): - """Step the environment with the given action for ``skip`` steps. - - Repeat action, sum reward, and max over last observations. - - Args: - action: The action to step through the environment with - Returns: - Max of the last two observations, reward, terminated, truncated, and info from the environment - """ - total_reward = np.zeros(self.env.unwrapped.reward_dim, dtype=np.float32) - terminated = truncated = False - info = {} - for i in range(self._skip): - obs, reward, terminated, truncated, info = self.env.step(action) - done = terminated or truncated - if i == self._skip - 2: - self._obs_buffer[0] = obs - if i == self._skip - 1: - self._obs_buffer[1] = obs - total_reward += reward - if done: - break - max_frame = self._obs_buffer.max(axis=0) - - return max_frame, total_reward, terminated, truncated, info diff --git a/mo_gymnasium/wrappers/__init__.py b/mo_gymnasium/wrappers/__init__.py new file mode 100644 index 00000000..274241a0 --- /dev/null +++ b/mo_gymnasium/wrappers/__init__.py @@ -0,0 +1,10 @@ +"""Contains all wrappers (vectors or not).""" + +from mo_gymnasium.wrappers import vector +from mo_gymnasium.wrappers.wrappers import ( + LinearReward, + MOClipReward, + MOMaxAndSkipObservation, + MONormalizeReward, + MORecordEpisodeStatistics, +) diff --git a/mo_gymnasium/wrappers/vector/__init__.py b/mo_gymnasium/wrappers/vector/__init__.py new file mode 100644 index 00000000..60225b17 --- /dev/null +++ b/mo_gymnasium/wrappers/vector/__init__.py @@ -0,0 +1,6 @@ +"""Vector wrappers.""" + +from mo_gymnasium.wrappers.vector.wrappers import ( + MORecordEpisodeStatistics, + MOSyncVectorEnv, +) diff --git a/mo_gymnasium/wrappers/vector/wrappers.py b/mo_gymnasium/wrappers/vector/wrappers.py new file mode 100644 index 00000000..6028061d --- /dev/null +++ b/mo_gymnasium/wrappers/vector/wrappers.py @@ -0,0 +1,227 @@ +"""Vector wrappers.""" + +import time +from copy import deepcopy +from typing import Any, Dict, Iterator, Tuple + +import gymnasium as gym +import numpy as np +from gymnasium.core import ActType, ObsType +from gymnasium.vector import SyncVectorEnv +from gymnasium.vector.utils import concatenate, iterate +from gymnasium.vector.vector_env import ArrayType, VectorEnv +from gymnasium.wrappers.vector import RecordEpisodeStatistics + + +class MOSyncVectorEnv(SyncVectorEnv): + """Vectorized environment that serially runs multiple environments. 
+ + Example: + >>> import mo_gymnasium as mo_gym + + >>> envs = mo_gym.wrappers.vector.MOSyncVectorEnv([ + ... lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(4) + ... ]) + >>> envs + MOSyncVectorEnv(num_envs=4) + >>> obs, infos = envs.reset() + >>> obs + array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype=int32) + >>> _ = envs.action_space.seed(42) + >>> actions = envs.action_space.sample() + >>> obs, rewards, terminateds, truncateds, infos = envs.step([0, 1, 2, 3]) + >>> obs + array([[0, 0], [1, 0], [0, 0], [0, 3]], dtype=int32) + >>> rewards + array([[0., -1.], [0.7, -1.], [0., -1.], [0., -1.]], dtype=float32) + >>> terminateds + array([False, True, False, False]) + """ + + def __init__( + self, + env_fns: Iterator[callable], + copy: bool = True, + ): + """Vectorized environment that serially runs multiple environments. + + Args: + env_fns: env constructors + copy: If ``True``, then the :meth:`reset` and :meth:`step` methods return a copy of the observations. + """ + SyncVectorEnv.__init__(self, env_fns, copy=copy) + # Just overrides the rewards memory to add the number of objectives + self.reward_space = self.envs[0].unwrapped.reward_space + self._rewards = np.zeros( + ( + self.num_envs, + self.reward_space.shape[0], + ), + dtype=np.float32, + ) + + def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayType, Dict[str, Any]]: + """Steps through each of the environments returning the batched results. + + Returns: + The batched environment step results + """ + actions = iterate(self.action_space, actions) + + observations, infos = [], {} + for i, action in enumerate(actions): + if self._autoreset_envs[i]: + env_obs, env_info = self.envs[i].reset() + + self._rewards[i] = np.zeros(self.reward_space.shape[0]) # This overrides Gymnasium's implem + self._terminations[i] = False + self._truncations[i] = False + else: + ( + env_obs, + self._rewards[i], + self._terminations[i], + self._truncations[i], + env_info, + ) = self.envs[ + i + ].step(action) + + observations.append(env_obs) + infos = self._add_info(infos, env_info, i) + + # Concatenate the observations + self._observations = concatenate(self.single_observation_space, observations, self._observations) + self._autoreset_envs = np.logical_or(self._terminations, self._truncations) + + return ( + deepcopy(self._observations) if self.copy else self._observations, + np.copy(self._rewards), + np.copy(self._terminations), + np.copy(self._truncations), + infos, + ) + + +class MORecordEpisodeStatistics(RecordEpisodeStatistics): + """This wrapper will keep track of cumulative rewards and episode lengths. + + At the end of any episode within the vectorized env, the statistics of the episode + will be added to ``info`` using the key ``episode``, and the ``_episode`` key + is used to indicate the environment index which has a terminated or truncated episode. + + For a vectorized environments the output will be in the form of (be careful to first wrap the env into vector before applying MORewordStatistics):: + + >>> infos = { # doctest: +SKIP + ... "episode": { + ... "r": "", + ... "dr": "", + ... "l": "", + ... "t": "" + ... }, + ... "_episode": "" + ... } + + Moreover, the most recent rewards and episode lengths are stored in buffers that can be accessed via + :attr:`wrapped_env.return_queue` and :attr:`wrapped_env.length_queue` respectively. 
+ + Attributes: + return_queue: The cumulative rewards of the last ``deque_size``-many episodes + length_queue: The lengths of the last ``deque_size``-many episodes + """ + + def __init__( + self, + env: VectorEnv, + gamma: float = 1.0, + buffer_length: int = 100, + stats_key: str = "episode", + ): + """This wrapper will keep track of cumulative rewards and episode lengths. + + Args: + env (Env): The environment to apply the wrapper + gamma: The discount factor + buffer_length: The size of the buffers :attr:`return_queue`, :attr:`length_queue` and :attr:`time_queue` + stats_key: The info key to save the data + """ + gym.utils.RecordConstructorArgs.__init__(self, buffer_length=buffer_length, stats_key=stats_key) + RecordEpisodeStatistics.__init__(self, env, buffer_length=buffer_length, stats_key=stats_key) + self.disc_episode_returns = None + self.reward_dim = self.env.unwrapped.reward_space.shape[0] + self.rewards_shape = (self.num_envs, self.reward_dim) + self.gamma = gamma + + def reset(self, **kwargs): + """Resets the environment using kwargs and resets the episode returns and lengths.""" + obs, info = super().reset(**kwargs) + + # CHANGE: Here we just override the standard implementation to extend to MO + self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + + return obs, info + + def step(self, actions: ActType) -> Tuple[ObsType, ArrayType, ArrayType, ArrayType, Dict[str, Any]]: + """Steps through the environment, recording the episode statistics.""" + ( + observations, + rewards, + terminations, + truncations, + infos, + ) = self.env.step(actions) + + assert isinstance( + infos, dict + ), f"`vector.RecordEpisodeStatistics` requires `info` type to be `dict`, its actual type is {type(infos)}. This may be due to usage of other wrappers in the wrong order." 
+ + self.episode_returns[self.prev_dones] = 0 + self.episode_lengths[self.prev_dones] = 0 + self.episode_start_times[self.prev_dones] = time.perf_counter() + self.episode_returns[~self.prev_dones] += rewards[~self.prev_dones] + + # CHANGE: The discounted returns are also computed here + self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( + self.episode_returns.shape + ) + self.episode_lengths[~self.prev_dones] += 1 + + self.prev_dones = dones = np.logical_or(terminations, truncations) + num_dones = np.sum(dones) + if num_dones: + if self._stats_key in infos or f"_{self._stats_key}" in infos: + raise ValueError(f"Attempted to add episode stats when they already exist, info keys: {list(infos.keys())}") + else: + # CHANGE to handle the vectorial reward and do deepcopies + episode_return = np.zeros(self.rewards_shape, dtype=np.float32) + disc_episode_return = np.zeros(self.rewards_shape, dtype=np.float32) + + for i in range(self.num_envs): + if dones[i]: + episode_return[i] = np.copy(self.episode_returns[i]) + disc_episode_return[i] = np.copy(self.disc_episode_returns[i]) + + episode_time_length = np.round(time.perf_counter() - self.episode_start_times, 6) + infos[self._stats_key] = { + "r": episode_return, + "dr": disc_episode_return, + "l": np.where(dones, self.episode_lengths, 0), + "t": np.where(dones, episode_time_length, 0.0), + } + infos[f"_{self._stats_key}"] = dones + + self.episode_count += num_dones + + for i in np.where(dones): + self.time_queue.extend(episode_time_length[i]) + self.return_queue.extend(self.episode_returns[i]) + self.length_queue.extend(self.episode_lengths[i]) + + return ( + observations, + rewards, + terminations, + truncations, + infos, + ) diff --git a/mo_gymnasium/wrappers/wrappers.py b/mo_gymnasium/wrappers/wrappers.py new file mode 100644 index 00000000..f7830865 --- /dev/null +++ b/mo_gymnasium/wrappers/wrappers.py @@ -0,0 +1,305 @@ +"""Wrappers.""" + +import time +from copy import deepcopy +from typing import Tuple, TypeVar + +import gymnasium as gym +import numpy as np +from gymnasium.wrappers.common import RecordEpisodeStatistics +from gymnasium.wrappers.utils import RunningMeanStd + + +ObsType = TypeVar("ObsType") +ActType = TypeVar("ActType") + + +class LinearReward(gym.Wrapper, gym.utils.RecordConstructorArgs): + """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector.""" + + def __init__(self, env: gym.Env, weight: np.ndarray = None): + """Makes the env return a scalar reward, which is the dot-product between the reward vector and the weight vector. + + Args: + env: env to wrap + weight: weight vector to use in the dot product + """ + gym.utils.RecordConstructorArgs.__init__(self, weight=weight) + gym.Wrapper.__init__(self, env) + if weight is None: + weight = np.ones(shape=env.unwrapped.reward_space.shape) + self.set_weight(weight) + + def set_weight(self, weight: np.ndarray): + """Changes weights for the scalarization. + + Args: + weight: new weights to set + Returns: nothing + """ + assert weight.shape == self.env.unwrapped.reward_space.shape, "Reward weight has different shape than reward vector." + self.w = weight + + def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]: + """Steps in the environment. 
+ + Args: + action: action to perform + Returns: obs, scalarized_reward, terminated, truncated, info + """ + observation, reward, terminated, truncated, info = self.env.step(action) + scalar_reward = np.dot(reward, self.w) + info["vector_reward"] = reward + info["reward_weights"] = self.w + + return observation, scalar_reward, terminated, truncated, info + + +class MONormalizeReward(gym.Wrapper, gym.utils.RecordConstructorArgs): + """Wrapper to normalize the reward component at index idx. Does not touch other reward components. + + This code is heavily inspired on Gymnasium's except that it extracts the reward component at given idx, normalizes it, and reinjects it. + + (!) This smoothes the moving average of the reward, which can be useful for training stability. But it does not "normalize" the reward in the sense of making it have a mean of 0 and a standard deviation of 1. + + Example: + >>> import mo_gymnasium as mo_gym + >>> from mo_gymnasium.wrappers import MONormalizeReward + >>> env = mo_gym.make("deep-sea-treasure-v0") + >>> norm_treasure_env = MONormalizeReward(env, idx=0) + >>> both_norm_env = MONormalizeReward(norm_treasure_env, idx=1) + >>> both_norm_env.reset() # This one normalizes both rewards + + """ + + def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float = 1e-8): + """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance. + + Args: + env (env): The environment to apply the wrapper + idx (int): the index of the reward to normalize + epsilon (float): A stability parameter + gamma (float): The discount factor that is used in the exponential moving average. + """ + gym.utils.RecordConstructorArgs.__init__(self, idx=idx, gamma=gamma, epsilon=epsilon) + gym.Wrapper.__init__(self, env) + self.idx = idx + self.return_rms = RunningMeanStd(shape=()) + self.discounted_reward: np.array = np.array([0.0]) + self.gamma = gamma + self.epsilon = epsilon + self._update_running_mean = True + + @property + def update_running_mean(self) -> bool: + """Property to freeze/continue the running mean calculation of the reward statistics.""" + return self._update_running_mean + + @update_running_mean.setter + def update_running_mean(self, setting: bool): + """Sets the property to freeze/continue the running mean calculation of the reward statistics.""" + self._update_running_mean = setting + + def step(self, action: ActType): + """Steps through the environment, normalizing the rewards returned. + + Args: + action: action to perform + Returns: obs, normalized_rewards, terminated, truncated, infos + """ + obs, rews, terminated, truncated, infos = self.env.step(action) + # Extracts the objective value to normalize + to_normalize = rews[self.idx] + + self.discounted_reward = self.discounted_reward * self.gamma * (1 - terminated) + float(to_normalize) + if self._update_running_mean: + self.return_rms.update(self.discounted_reward) + + # We don't (reward - self.return_rms.mean) see https://github.com/openai/baselines/issues/538 + normalized_reward = to_normalize / np.sqrt(self.return_rms.var + self.epsilon) + + # Injecting the normalized objective value back into the reward vector + rews[self.idx] = normalized_reward + return obs, rews, terminated, truncated, infos + + +class MOClipReward(gym.RewardWrapper, gym.utils.RecordConstructorArgs): + """Clip reward[idx] to [min, max].""" + + def __init__(self, env: gym.Env, idx: int, min_r, max_r): + """Clip reward[idx] to [min, max]. 
+ + Args: + env: environment to wrap + idx: index of the MO reward to clip + min_r: min reward + max_r: max reward + """ + gym.utils.RecordConstructorArgs.__init__(self, idx=idx, min_r=min_r, max_r=max_r) + gym.RewardWrapper.__init__(self, env) + self.idx = idx + self.min_r = min_r + self.max_r = max_r + + def reward(self, reward): + """Clips the reward at the given index. + + Args: + reward: reward to clip. + Returns: the clipped reward. + """ + reward[self.idx] = np.clip(reward[self.idx], self.min_r, self.max_r) + return reward + + +class MORecordEpisodeStatistics(RecordEpisodeStatistics, gym.utils.RecordConstructorArgs): + """This wrapper will keep track of cumulative rewards and episode lengths. + + After the completion of an episode, ``info`` will look like this:: + + >>> info = { + ... "episode": { + ... "r": "", + ... "dr": "", + ... "l": "", + ... "t": "" + ... }, + ... } + """ + + def __init__( + self, + env: gym.Env, + gamma: float = 1.0, + buffer_length: int = 100, + stats_key: str = "episode", + ): + """This wrapper will keep track of cumulative rewards and episode lengths. + + Args: + env (Env): The environment to apply the wrapper + gamma (float): Discounting factor + buffer_length: The size of the buffers :attr:`return_queue`, :attr:`length_queue` and :attr:`time_queue` + stats_key: The info key for the episode statistics + """ + gym.utils.RecordConstructorArgs.__init__(self, gamma=gamma, buffer_length=buffer_length, stats_key=stats_key) + RecordEpisodeStatistics.__init__(self, env, buffer_length=buffer_length, stats_key=stats_key) + # CHANGE: Here we just override the standard implementation to extend to MO + self.reward_dim = self.env.unwrapped.reward_space.shape[0] + self.rewards_shape = (self.reward_dim,) + self.gamma = gamma + + def step(self, action): + """Steps through the environment, recording the episode statistics.""" + # This is very close the code from the RecordEpisodeStatistics wrapper from Gymnasium. + ( + observation, + rewards, + terminated, + truncated, + info, + ) = self.env.step(action) + assert isinstance( + info, dict + ), f"`info` dtype is {type(info)} while supported dtype is `dict`. This may be due to usage of other wrappers in the wrong order." 
+ self.episode_returns += rewards + + # CHANGE: The discounted returns are also computed here + self.disc_episode_returns += rewards * np.repeat(self.gamma**self.episode_lengths, self.reward_dim).reshape( + self.episode_returns.shape + ) + self.episode_lengths += 1 + + if terminated or truncated: + assert self._stats_key not in info + + episode_time_length = round(time.perf_counter() - self.episode_start_time, 6) + + # Make a deepcopy to void subsequent mutation of the numpy array + episode_returns = deepcopy(self.episode_returns) + disc_episode_returns = deepcopy(self.disc_episode_returns) + + info["episode"] = { + "r": episode_returns, + "dr": disc_episode_returns, + "l": self.episode_lengths, + "t": episode_time_length, + } + + self.time_queue.append(episode_time_length) + self.return_queue.append(episode_returns) + self.length_queue.append(self.episode_lengths) + + self.episode_count += 1 + self.episode_start_time = time.perf_counter() + + return ( + observation, + rewards, + terminated, + truncated, + info, + ) + + def reset(self, **kwargs): + """Resets the environment using kwargs and resets the episode returns and lengths.""" + obs, info = super().reset(**kwargs) + + # CHANGE: Here we just override the standard implementation to extend to MO + self.episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + self.disc_episode_returns = np.zeros(self.rewards_shape, dtype=np.float32) + + return obs, info + + +class MOMaxAndSkipObservation(gym.Wrapper): + """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last observations. + + Note: This wrapper is based on the wrapper from stable-baselines3: https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/common/atari_wrappers.html#MaxAndSkipEnv + """ + + def __init__(self, env: gym.Env[ObsType, ActType], skip: int = 4): + """This wrapper will return only every ``skip``-th frame (frameskipping) and return the max between the two last frames. + + Args: + env (Env): The environment to apply the wrapper + skip: The number of frames to skip + """ + gym.Wrapper.__init__(self, env) + + if not np.issubdtype(type(skip), np.integer): + raise TypeError(f"The skip is expected to be an integer, actual type: {type(skip)}") + if skip < 2: + raise ValueError(f"The skip value needs to be equal or greater than two, actual value: {skip}") + if env.observation_space.shape is None: + raise ValueError("The observation space must have the shape attribute.") + + self._skip = skip + self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype) + + def step(self, action): + """Step the environment with the given action for ``skip`` steps. + + Repeat action, sum reward, and max over last observations. 
+
+        Args:
+            action: The action to step through the environment with
+        Returns:
+            Max of the last two observations, reward, terminated, truncated, and info from the environment
+        """
+        total_reward = np.zeros(self.env.unwrapped.reward_dim, dtype=np.float32)
+        terminated = truncated = False
+        info = {}
+        for i in range(self._skip):
+            obs, reward, terminated, truncated, info = self.env.step(action)
+            done = terminated or truncated
+            if i == self._skip - 2:
+                self._obs_buffer[0] = obs
+            if i == self._skip - 1:
+                self._obs_buffer[1] = obs
+            total_reward += reward
+            if done:
+                break
+        max_frame = self._obs_buffer.max(axis=0)
+
+        return max_frame, total_reward, terminated, truncated, info
diff --git a/pyproject.toml b/pyproject.toml
index 160f2b53..d4b42d55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ readme = "README.md"
 requires-python = ">= 3.8"
 authors = [{ name = "Farama Foundation", email = "contact@farama.org" }]
 license = { text = "MIT License" }
-keywords = ["Reinforcement Learning", "Multi-Objective", "RL", "AI", "gymnasium"]
+keywords = ["Reinforcement Learning", "Multi-Objective", "RL", "AI", "Gymnasium"]
 classifiers = [
     "Development Status :: 4 - Beta",  # change to `5 - Production/Stable` when ready
     "License :: OSI Approved :: MIT License",
@@ -22,8 +22,8 @@ classifiers = [
     'Topic :: Scientific/Engineering :: Artificial Intelligence',
 ]
 dependencies = [
-    "gymnasium>=0.28.1,<0.30",
-    "numpy >=1.21.0",
+    "gymnasium >=1.0.0",
+    "numpy >=1.21.0,<2.0",
     "pygame >=2.1.0",
     "scipy >=1.7.3",
     "pymoo >=0.6.0",
@@ -49,7 +49,7 @@ all = [
     "imageio >=2.14.1",
     "mujoco >=2.2.0",
     # highway
-    "highway-env >= 1.8",
+    "highway-env >= 1.9.1",
     # box2d
     "box2d-py ==2.3.5",
     "pygame ==2.1.3.dev8",
@@ -73,12 +73,12 @@ include = ["mo_gymnasium", "mo_gymnasium.*"]
 mo_gymnasium = [
     "**/*.json",
     "**/assets/*",
+    "py.typed"
 ]
 
 # Linters and Test tools #######################################################
 
 [tool.black]
-safe = true
 line-length = 127
 target-version = ['py38', 'py39', 'py310', 'py311']
 include = '\.pyi?$'
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 28af4b0c..568cc6b6 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -14,6 +14,7 @@ for env_spec in gym.envs.registry.values():
     if type(env_spec.entry_point) is not str:
         continue
 
+    # collect MO Gymnasium envs
     if env_spec.entry_point.split(".")[0] == "mo_gymnasium":
         all_testing_env_specs.append(env_spec)
 
@@ -27,7 +28,7 @@ def test_all_env_api(spec):
     """Check that all environments pass the environment checker."""
     env = mo_gym.make(spec.id)
 
-    env = mo_gym.LinearReward(env)
+    env = mo_gym.wrappers.LinearReward(env)
     check_env(env, skip_render_check=True)
     _test_reward_bounds(env.unwrapped)
     _test_pickle_env(env)
@@ -46,7 +47,7 @@ def test_all_env_passive_env_checker(spec):
     [
         ("MountainCar-v0", "mo-mountaincar-v0"),
         ("MountainCarContinuous-v0", "mo-mountaincarcontinuous-v0"),
-        ("LunarLander-v2", "mo-lunar-lander-v2"),
+        ("LunarLander-v3", "mo-lunar-lander-v3"),
         # ("Reacher-v4", "mo-reacher-v4"),  # use a different model and action space
         ("Hopper-v4", "mo-hopper-v4"),
         ("HalfCheetah-v4", "mo-halfcheetah-v4"),
@@ -58,7 +59,7 @@
 )
 def test_gymnasium_equivalence(gym_id, mo_gym_id, num_steps=100, seed=123):
     env = gym.make(gym_id)
-    mo_env = mo_gym.LinearReward(mo_gym.make(mo_gym_id))
+    mo_env = mo_gym.wrappers.LinearReward(mo_gym.make(mo_gym_id))
 
     # for float rewards, then precision becomes an issue
     env = gym.wrappers.TransformReward(env, lambda reward: round(reward, 4))
@@ -93,8 +94,8 @@ def test_env_determinism_rollout(env_spec: EnvSpec):
 
     env_1 = mo_gym.make(env_spec.id)
     env_2 = mo_gym.make(env_spec.id)
-    env_1 = mo_gym.LinearReward(env_1)
-    env_2 = mo_gym.LinearReward(env_2)
+    env_1 = mo_gym.wrappers.LinearReward(env_1)
+    env_2 = mo_gym.wrappers.LinearReward(env_2)
 
     initial_obs_1, initial_info_1 = env_1.reset(seed=SEED)
     initial_obs_2, initial_info_2 = env_2.reset(seed=SEED)
@@ -156,7 +157,7 @@ def assert_equals(a, b, prefix=None):
         b: second data structure
         prefix: prefix for failed assertion message for types and dicts
     """
-    assert type(a) == type(b), f"{prefix}Differing types: {a} and {b}"
+    assert type(a) is type(b), f"{prefix}Differing types: {a} and {b}"
 
     if isinstance(a, dict):
         assert list(a.keys()) == list(b.keys()), f"{prefix}Key sets differ: {a} and {b}"
@@ -190,7 +191,7 @@ def test_ccs_dst():
         np.array([19.778, -17.383]),
     ]
 
-    discounted_front = env.pareto_front(gamma=0.99)
+    discounted_front = env.unwrapped.pareto_front(gamma=0.99)
     for desired, actual in zip(known_ccs, discounted_front):
         np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -200,7 +201,7 @@ def test_ccs_dst_no_discount():
 
     known_ccs = mo_gym.envs.deep_sea_treasure.deep_sea_treasure.CONVEX_FRONT
 
-    discounted_front = env.pareto_front(gamma=1.0)
+    discounted_front = env.unwrapped.pareto_front(gamma=1.0)
     for desired, actual in zip(known_ccs, discounted_front):
         np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -223,7 +224,7 @@ def test_concave_pf_dst():
         np.array([124.0 * gamma**18, -17.383]),
     ]
 
-    discounted_front = env.pareto_front(gamma=0.99)
+    discounted_front = env.unwrapped.pareto_front(gamma=0.99)
     for desired, actual in zip(known_pf, discounted_front):
         np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -233,7 +234,7 @@ def test_concave_pf_dst_no_discount():
 
     known_pf = mo_gym.envs.deep_sea_treasure.deep_sea_treasure.CONCAVE_FRONT
 
-    discounted_front = env.pareto_front(gamma=1.0)
+    discounted_front = env.unwrapped.pareto_front(gamma=1.0)
     for desired, actual in zip(known_pf, discounted_front):
         np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -244,7 +245,7 @@ def test_pf_fruit_tree():
 
     known_pf = np.array(mo_gym.envs.fruit_tree.fruit_tree.FRUITS[str(depth)]) * (0.99 ** (depth - 1))
 
-    discounted_front = env.pareto_front(gamma=0.99)
+    discounted_front = env.unwrapped.pareto_front(gamma=0.99)
     for desired, actual in zip(known_pf, discounted_front):
         np.testing.assert_array_almost_equal(desired, actual, decimal=2)
@@ -255,6 +256,6 @@ def test_pf_fruit_tree_no_discount():
 
     known_pf = mo_gym.envs.fruit_tree.fruit_tree.FRUITS[str(depth)]
 
-    discounted_front = env.pareto_front(gamma=1.0)
+    discounted_front = env.unwrapped.pareto_front(gamma=1.0)
     for desired, actual in zip(known_pf, discounted_front):
         np.testing.assert_array_almost_equal(desired, actual, decimal=2)
diff --git a/tests/test_vector_wrappers.py b/tests/test_vector_wrappers.py
new file mode 100644
index 00000000..d57d7567
--- /dev/null
+++ b/tests/test_vector_wrappers.py
@@ -0,0 +1,89 @@
+import gymnasium as gym
+import numpy as np
+
+import mo_gymnasium as mo_gym
+from mo_gymnasium.wrappers.vector import MORecordEpisodeStatistics, MOSyncVectorEnv
+
+
+def test_mo_sync_wrapper():
+    num_envs = 3
+    envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
+
+    envs.reset()
+    obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
+    assert len(obs) == num_envs, "Number of observations does not match the number of envs"
+    assert len(rewards) == num_envs, "Number of rewards does not match the number of envs"
+    assert len(terminateds) == num_envs, "Number of terminateds does not match the number of envs"
+    assert len(truncateds) == num_envs, "Number of truncateds does not match the number of envs"
+    envs.close()
+
+
+def test_mo_sync_autoreset():
+    num_envs = 2
+    envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
+
+    obs, infos = envs.reset()
+    assert (obs[0] == [0, 0]).all()
+    assert (obs[1] == [0, 0]).all()
+    obs, rewards, terminateds, truncateds, infos = envs.step([0, 1])
+    assert (obs[0] == [0, 0]).all()
+    assert (obs[1] == [1, 0]).all()
+    # Use np assert almost equal to avoid floating point errors
+    np.testing.assert_almost_equal(rewards[0], np.array([0.0, -1.0], dtype=np.float32), decimal=2)
+    np.testing.assert_almost_equal(rewards[1], np.array([0.7, -1.0], dtype=np.float32), decimal=2)
+    assert not terminateds[0]
+    assert terminateds[1]  # This one is done
+    assert not truncateds[0]
+    assert not truncateds[1]
+    obs, rewards, terminateds, truncateds, infos = envs.step([0, 1])
+    assert (obs[0] == [0, 0]).all()
+    assert (obs[1] == [0, 0]).all()
+    assert (rewards[0] == [0.0, -1.0]).all()
+    assert (rewards[1] == [0.0, 0.0]).all()  # Reset step
+    assert not terminateds[0]
+    assert not terminateds[1]  # Not done anymore
+    envs.close()
+
+
+def test_mo_record_ep_statistic_vector_env():
+    num_envs = 2
+    envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(num_envs)])
+    envs = MORecordEpisodeStatistics(envs, gamma=0.97)
+
+    envs.reset()
+    terminateds = np.array([False] * num_envs)
+    info = {}
+    obs, rewards, terminateds, _, info = envs.step([0, 3])
+    obs, rewards, terminateds, _, info = envs.step([0, 1])
+    obs, rewards, terminateds, _, info = envs.step([0, 1])
+
+    assert isinstance(info["episode"]["r"], np.ndarray)
+    assert isinstance(info["episode"]["dr"], np.ndarray)
+    # Episode records are vectorized because there are multiple environments
+    assert info["episode"]["r"].shape == (num_envs, 2)
+    np.testing.assert_almost_equal(info["episode"]["r"][0], np.array([0.0, 0.0], dtype=np.float32), decimal=2)
+    np.testing.assert_almost_equal(info["episode"]["r"][1], np.array([8.2, -3.0], dtype=np.float32), decimal=2)
+    assert info["episode"]["dr"].shape == (num_envs, 2)
+    np.testing.assert_almost_equal(info["episode"]["dr"][0], np.array([0.0, 0.0], dtype=np.float32), decimal=2)
+    np.testing.assert_almost_equal(info["episode"]["dr"][1], np.array([7.72, -2.91], dtype=np.float32), decimal=2)
+    assert isinstance(info["episode"]["l"], np.ndarray)
+    np.testing.assert_almost_equal(info["episode"]["l"], np.array([0, 3], dtype=np.float32), decimal=2)
+    assert isinstance(info["episode"]["t"], np.ndarray)
+    envs.close()
+
+
+def test_gym_wrapper_and_vector():
+    # This tests the integration of gym-wrapped envs with MO-Gymnasium vectorized envs
+    num_envs = 2
+    envs = MOSyncVectorEnv(
+        [lambda: gym.wrappers.NormalizeObservation(mo_gym.make("deep-sea-treasure-v0")) for _ in range(num_envs)]
+    )
+
+    envs.reset()
+    for i in range(30):
+        obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
+        assert len(obs) == num_envs, "Number of observations does not match the number of envs"
+        assert len(rewards) == num_envs, "Number of rewards does not match the number of envs"
+        assert len(terminateds) == num_envs, "Number of terminateds does not match the number of envs"
+        assert len(truncateds) == num_envs, "Number of truncateds does not match the number of envs"
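+    # Closing the vectorized env also closes the wrapped sub-environments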
+    envs.close()
diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py
index 9cf42354..df2ded4a 100644
--- a/tests/test_wrappers.py
+++ b/tests/test_wrappers.py
@@ -1,11 +1,10 @@
 import numpy as np
 
 import mo_gymnasium as mo_gym
-from mo_gymnasium import (
+from mo_gymnasium.wrappers import (
     MOClipReward,
     MONormalizeReward,
     MORecordEpisodeStatistics,
-    MOSyncVectorEnv,
 )
 
 
@@ -14,35 +13,42 @@ def go_to_8_3(env):
     Goes to (8.2, -3) treasure, returns the rewards
     """
     env.reset()
-    env.step(3)  # right
-    env.step(1)  # down
-    _, rewards, _, _, infos = env.step(1)
+    env.step(3)  # action: right, rewards: [0, -1]
+    env.step(1)  # action: down, rewards: [0, -1]
+    _, rewards, _, _, infos = env.step(1)  # action: down, rewards: [8.2, -1]
     return rewards, infos
 
 
 def test_normalization_wrapper():
+    # Watch out: the wrapper does not normalize the rewards to have a mean of 0 and a std of 1;
+    # instead, it scales each reward by a running estimate of the std of the discounted returns
     env = mo_gym.make("deep-sea-treasure-v0")
     norm_treasure_env = MONormalizeReward(env, idx=0)
     both_norm_env = MONormalizeReward(norm_treasure_env, idx=1)
 
+    # No normalization
+    env.reset(seed=0)
+    _, rewards, _, _, _ = env.step(1)
+    np.testing.assert_almost_equal(rewards, [0.7, -1.0], decimal=2)
+
     # Tests for both rewards normalized
     for i in range(30):
         go_to_8_3(both_norm_env)
-    both_norm_env.reset()
+    both_norm_env.reset(seed=0)
     _, rewards, _, _, _ = both_norm_env.step(1)  # down
-    np.testing.assert_allclose(rewards, [0.18, -1.24], rtol=0, atol=1e-2)
+    np.testing.assert_almost_equal(rewards, [0.5, -1.24], decimal=2)
     rewards, _ = go_to_8_3(both_norm_env)
-    np.testing.assert_allclose(rewards, [2.13, -1.24], rtol=0, atol=1e-2)
+    np.testing.assert_almost_equal(rewards, [4.73, -1.24], decimal=2)
 
     # Tests for only treasure normalized
     for i in range(30):
         go_to_8_3(norm_treasure_env)
-    norm_treasure_env.reset()
+    norm_treasure_env.reset(seed=0)
     _, rewards, _, _, _ = norm_treasure_env.step(1)  # down
     # Time rewards are not normalized (-1)
-    np.testing.assert_allclose(rewards, [0.18, -1.0], rtol=0, atol=1e-2)
+    np.testing.assert_almost_equal(rewards, [0.51, -1.0], decimal=2)
     rewards, _ = go_to_8_3(norm_treasure_env)
-    np.testing.assert_allclose(rewards, [2.13, -1.0], rtol=0, atol=1e-2)
+    np.testing.assert_almost_equal(rewards, [5.33, -1.0], decimal=2)
 
 
 def test_clip_wrapper():
@@ -66,26 +72,6 @@ def test_clip_wrapper():
     np.testing.assert_allclose(rewards, [0.5, -1.0], rtol=0, atol=1e-2)
 
 
-def test_mo_sync_wrapper():
-    def make_env(env_id):
-        def thunk():
-            env = mo_gym.make(env_id)
-            env = MORecordEpisodeStatistics(env, gamma=0.97)
-            return env
-
-        return thunk
-
-    num_envs = 3
-    envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)])
-
-    envs.reset()
-    obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
-    assert len(obs) == num_envs, "Number of observations do not match the number of envs"
-    assert len(rewards) == num_envs, "Number of rewards do not match the number of envs"
-    assert len(terminateds) == num_envs, "Number of terminateds do not match the number of envs"
-    assert len(truncateds) == num_envs, "Number of truncateds do not match the number of envs"
-
-
 def test_mo_record_ep_statistic():
     env = mo_gym.make("deep-sea-treasure-v0")
     env = MORecordEpisodeStatistics(env, gamma=0.97)
@@ -98,37 +84,9 @@ def test_mo_record_ep_statistic():
     assert info["episode"]["r"].shape == (2,)
     assert info["episode"]["dr"].shape == (2,)
     assert tuple(info["episode"]["r"]) == (np.float32(8.2), np.float32(-3.0))
-    assert tuple(np.round(info["episode"]["dr"], 2)) == (
-        np.float32(7.48),
-        np.float32(-2.82),
-    )
-    assert isinstance(info["episode"]["l"], np.int32)
+    np.testing.assert_allclose(info["episode"]["dr"], [7.71538, -2.9109], rtol=0, atol=1e-2)
+    # 0 * 0.97**0 + 0 * 0.97**1 + 8.2 * 0.97**2 == 7.71538
+    # -1 * 0.97**0 + -1 * 0.97**1 + -1 * 0.97**2 == -2.9109
+    assert isinstance(info["episode"]["l"], int)
     assert info["episode"]["l"] == 3
-    assert isinstance(info["episode"]["t"], np.float32)
-
-
-def test_mo_record_ep_statistic_vector_env():
-    def make_env(env_id):
-        def thunk():
-            env = mo_gym.make(env_id)
-            return env
-
-        return thunk
-
-    num_envs = 3
-    envs = MOSyncVectorEnv([make_env("deep-sea-treasure-v0") for _ in range(num_envs)])
-    envs = MORecordEpisodeStatistics(envs)
-
-    envs.reset()
-    terminateds = np.array([False] * num_envs)
-    info = {}
-    while not np.any(terminateds):
-        obs, rewards, terminateds, _, info = envs.step(envs.action_space.sample())
-
-    assert isinstance(info["episode"]["r"], np.ndarray)
-    assert isinstance(info["episode"]["dr"], np.ndarray)
-    # Episode records are vectorized because multiple environments
-    assert info["episode"]["r"].shape == (num_envs, 2)
-    assert info["episode"]["dr"].shape == (num_envs, 2)
-    assert isinstance(info["episode"]["l"], np.ndarray)
-    assert isinstance(info["episode"]["t"], np.ndarray)
+    assert isinstance(info["episode"]["t"], float)
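# A minimal, illustrative usage sketch of the relocated wrappers exercised by the tests above.
# It assumes the `deep-sea-treasure-v0` environment registered by mo-gymnasium; any other
# registered MO environment works the same way.
import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers import MONormalizeReward, MORecordEpisodeStatistics
from mo_gymnasium.wrappers.vector import MOSyncVectorEnv

# Single environment: normalize the treasure objective and record vector episode statistics.
env = mo_gym.make("deep-sea-treasure-v0")
env = MONormalizeReward(env, idx=0)
env = MORecordEpisodeStatistics(env, gamma=0.97)
obs, info = env.reset(seed=0)
obs, vector_reward, terminated, truncated, info = env.step(env.action_space.sample())

# Vectorized environments: MOSyncVectorEnv takes a list of environment thunks.
envs = MOSyncVectorEnv([lambda: mo_gym.make("deep-sea-treasure-v0") for _ in range(3)])
obs, infos = envs.reset(seed=42)
obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
envs.close()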