diff --git a/benchmarking/README.md b/benchmarking/README.md index 5566a684c..2539597f7 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -186,25 +186,3 @@ where: If `your_runs_dir` contains runs for more than one algorithm, you will have to disambiguate using the `--algo` option. - -## Tuning Hyperparameters - -The hyperparameters of any algorithm in imitation can be tuned using `src/imitation/scripts/tuning.py`. -The benchmarking hyperparameter configs were generated by tuning the hyperparameters using -the search space defined in the `scripts/config/tuning.py`. - -The tuning script proceeds in two phases: -1. Tune the hyperparameters using the search space provided. -2. Re-evaluate the best hyperparameter config found in the first phase based on the maximum mean return on a separate set of seeds. Report the mean and standard deviation of these trials. - -To use it with the default search space: -```bash -python -m imitation.scripts.tuning with 'parallel_run_config.base_named_configs=[""]' -``` - -In this command: -- `` provides the default search space and settings for the specific algorithm, which is defined in the `scripts/config/tuning.py` -- `` sets the environment to tune the algorithm in. They are defined in the algo-specifc `scripts/config/train_[adversarial|imitation|preference_comparisons|rl].py` files. For the already tuned environments, use the `_` named configs here. - -See the documentation of `scripts/tuning.py` and `scripts/parallel.py` for many other arguments that can be -provided through the command line to change the tuning behavior. diff --git a/setup.py b/setup.py index 1c2c85af6..1a76e49fb 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ IS_NOT_WINDOWS = os.name != "nt" -PARALLEL_REQUIRE = ["ray[debug,tune]~=2.0.0"] +PARALLEL_REQUIRE = ["ray[debug,tune]~=2.9.0"] ATARI_REQUIRE = [ "seals[atari]~=0.2.1", ] diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 14a8fad5b..ba9a49b40 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1678,6 +1678,10 @@ def train( unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations)) probs = unnormalized_probs / np.sum(unnormalized_probs) shares = util.oric(probs * total_comparisons) + shares[ + shares <= 0 + ] = 1 # ensure we at least request one comparison per iteration + schedule = [initial_comparisons] + shares.tolist() print(f"Query schedule: {schedule}") diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py index f151e768e..b189895b3 100644 --- a/src/imitation/scripts/config/train_imitation.py +++ b/src/imitation/scripts/config/train_imitation.py @@ -45,6 +45,13 @@ def seals_mountain_car(): environment = dict(gym_id="seals/MountainCar-v0") bc = dict(l2_weight=0.0) dagger = dict(total_timesteps=20000) + sqil = dict(total_timesteps=1e5) + + +@train_imitation_ex.named_config +def seals_ant(): + environment = dict(gym_id="seals/Ant-v1") + sqil = dict(total_timesteps=2e6) @train_imitation_ex.named_config @@ -57,11 +64,13 @@ def cartpole(): def seals_cartpole(): environment = dict(gym_id="seals/CartPole-v0") dagger = dict(total_timesteps=20000) + sqil = dict(total_timesteps=1e5) @train_imitation_ex.named_config def pendulum(): environment = dict(gym_id="Pendulum-v1") + sqil = dict(total_timesteps=1e5) @train_imitation_ex.named_config @@ -76,6 +85,24 @@ def half_cheetah(): dagger = 
dict(total_timesteps=60000) +@train_imitation_ex.named_config +def seals_half_cheetah(): + environment = dict(gym_id="seals/HalfCheetah-v1") + sqil = dict(total_timesteps=2e6) + + +@train_imitation_ex.named_config +def seals_hopper(): + environment = dict(gym_id="seals/Hopper-v1") + sqil = dict(total_timesteps=2e6) + + +@train_imitation_ex.named_config +def seals_walker(): + environment = dict(gym_id="seals/Walker2d-v1") + sqil = dict(total_timesteps=2e6) + + @train_imitation_ex.named_config def humanoid(): environment = dict(gym_id="Humanoid-v2") @@ -83,7 +110,8 @@ def humanoid(): @train_imitation_ex.named_config def seals_humanoid(): - environment = dict(gym_id="seals/Humanoid-v0") + environment = dict(gym_id="seals/Humanoid-v1") + sqil = dict(total_timesteps=2e6) @train_imitation_ex.named_config diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 4d8531732..b053d3f38 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -42,6 +42,8 @@ def train_defaults(): transition_oversampling = 1 # fraction of total_comparisons that will be sampled right at the beginning initial_comparison_frac = 0.1 + # factor by which to oversample the number of epochs in the first iteration + initial_epoch_multiplier = 200.0 # fraction of sampled trajectories that will include some random actions exploration_frac = 0.0 preference_model_kwargs = {} @@ -77,7 +79,7 @@ def cartpole(): @train_preference_comparisons_ex.named_config def seals_ant(): - environment = dict(gym_id="seals/Ant-v0") + environment = dict(gym_id="seals/Ant-v1") rl = dict( batch_size=2048, rl_kwargs=dict( @@ -104,7 +106,7 @@ def half_cheetah(): @train_preference_comparisons_ex.named_config def seals_half_cheetah(): - environment = dict(gym_id="seals/HalfCheetah-v0") + environment = dict(gym_id="seals/HalfCheetah-v1") rl = dict( batch_size=512, rl_kwargs=dict( @@ -125,7 +127,7 @@ def seals_half_cheetah(): @train_preference_comparisons_ex.named_config def seals_hopper(): - environment = dict(gym_id="seals/Hopper-v0") + environment = dict(gym_id="seals/Hopper-v1") policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( @@ -151,7 +153,7 @@ def seals_hopper(): @train_preference_comparisons_ex.named_config def seals_swimmer(): - environment = dict(gym_id="seals/Swimmer-v0") + environment = dict(gym_id="seals/Swimmer-v1") policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( @@ -178,7 +180,7 @@ def seals_swimmer(): @train_preference_comparisons_ex.named_config def seals_walker(): - environment = dict(gym_id="seals/Walker2d-v0") + environment = dict(gym_id="seals/Walker2d-v1") policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( @@ -206,7 +208,7 @@ def seals_walker(): @train_preference_comparisons_ex.named_config def seals_humanoid(): locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Humanoid-v0") + environment = dict(gym_id="seals/Humanoid-v1") total_timesteps = int(4e6) diff --git a/src/imitation/scripts/config/tuning.py b/src/imitation/scripts/config/tuning.py index 73313770a..22d1d82fb 100644 --- a/src/imitation/scripts/config/tuning.py +++ b/src/imitation/scripts/config/tuning.py @@ -2,7 +2,6 @@ import ray.tune as tune import sacred -from torch import nn from imitation.algorithms import dagger as dagger_alg from imitation.scripts.parallel import parallel_ex @@ -188,38 +187,48 @@ def pc(): parallel_run_config = dict( 
sacred_ex_name="train_preference_comparisons", run_name="pc_tuning", - base_named_configs=["logging.wandb_logging"], + base_named_configs=[], base_config_updates={ "environment": {"num_vec": 1}, - "demonstrations": {"source": "huggingface"}, "total_timesteps": 2e7, - "total_comparisons": 5000, - "query_schedule": "hyperbolic", - "gatherer_kwargs": {"sample": True}, + "total_comparisons": 1000, + "active_selection": True, }, search_space={ - "named_configs": [ - ["reward.normalize_output_disable"], - ], + "named_configs": ["reward.reward_ensemble"], "config_updates": { - "train": { - "policy_kwargs": { - "activation_fn": tune.choice( - [ - nn.ReLU, - ], - ), - }, + "active_selection_oversampling": tune.randint(1, 11), + "comparison_queue_size": tune.randint( + 1, 1001, + ), # upper bound determined by total_comparisons=1000 + "exploration_frac": tune.uniform(0.0, 0.5), + "fragment_length": tune.randint( + 1, 1001, + ), # trajectories are 1000 steps long + "gatherer_kwargs": { + "temperature": tune.uniform(0.0, 2.0), + "discount_factor": tune.uniform(0.95, 1.0), + "sample": tune.choice([True, False]), + }, + "initial_comparison_frac": tune.uniform(0.01, 1.0), + "num_iterations": tune.randint(1, 51), + "preference_model_kwargs": { + "noise_prob": tune.uniform(0.0, 0.1), + "discount_factor": tune.uniform(0.95, 1.0), }, - "num_iterations": tune.choice([25, 50]), - "initial_comparison_frac": tune.choice([0.1, 0.25]), + "query_schedule": tune.choice( + ["hyperbolic", "constant", "inverse_quadratic",] + ), + "trajectory_generator_kwargs": { + "switch_prob": tune.uniform(0.1, 1), + "random_prob": tune.uniform(0.1, 0.9), + }, + "transition_oversampling": tune.uniform(0.9, 2.0), "reward_trainer_kwargs": { - "epochs": tune.choice([1, 3, 6]), + "epochs": tune.randint(1, 11), }, "rl": { - "batch_size": tune.choice([512, 2048, 8192]), "rl_kwargs": { - "learning_rate": tune.loguniform(1e-5, 1e-2), "ent_coef": tune.loguniform(1e-7, 1e-3), }, }, diff --git a/src/imitation/scripts/ingredients/rl.py b/src/imitation/scripts/ingredients/rl.py index d5373c773..bf43a129f 100644 --- a/src/imitation/scripts/ingredients/rl.py +++ b/src/imitation/scripts/ingredients/rl.py @@ -98,6 +98,11 @@ def sac(): locals() # quieten flake8 +@rl_ingredient.named_config +def dqn(): + rl_cls = sb3.DQN + + def _maybe_add_relabel_buffer( rl_kwargs: Dict[str, Any], relabel_reward_fn: Optional[RewardFn] = None, diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index d5e5e2378..76a068224 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -188,13 +188,12 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Mapping[str, Any]) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: config: Keyword arguments for `ex.run()`, where `ex` is the `sacred.Experiment` instance associated with `sacred_ex_name`. - reporter: Callback to report progress to Ray. Returns: Result from `ray.Run` object. 
diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 71363daee..428c98381 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -68,6 +68,7 @@ def train_preference_comparisons( fragment_length: int, transition_oversampling: float, initial_comparison_frac: float, + initial_epoch_multiplier: float, exploration_frac: float, trajectory_path: Optional[str], trajectory_generator_kwargs: Mapping[str, Any], @@ -106,6 +107,9 @@ def train_preference_comparisons( sampled before the rest of training begins (using the randomly initialized agent). This can be used to pretrain the reward model before the agent is trained on the learned reward. + initial_epoch_multiplier: before agent training begins, train the reward + model for this many more epochs than usual (on fragments sampled from a + random agent). exploration_frac: fraction of trajectory samples that will be created using partially random actions, rather than the current policy. Might be helpful if the learned policy explores too little and gets stuck with a wrong @@ -258,6 +262,7 @@ def train_preference_comparisons( fragment_length=fragment_length, transition_oversampling=transition_oversampling, initial_comparison_frac=initial_comparison_frac, + initial_epoch_multiplier=initial_epoch_multiplier, custom_logger=custom_logger, allow_variable_horizon=allow_variable_horizon, query_schedule=query_schedule, diff --git a/tuning/README.md b/tuning/README.md new file mode 100644 index 000000000..dee25da45 --- /dev/null +++ b/tuning/README.md @@ -0,0 +1,38 @@ +# Tuning Hyperparameters +This directory contains scripts for tuning hyperparameters for imitation learning algorithms. +Additional helper scripts allow for running multiple tuning jobs in parallel on a SLURM cluster. + +Use `tune.py` to tune hyperparameters for a single algorithm and environment using Optuna. +If you want to specify a custom algorithm and search space, add it to the dict in `hp_search_spaces.py`. + +You can tune using multiple workers in parallel by running multiple instances of `tune.py` that all point to the same journal log file (see `tune.py --help` for details). +To easily launch multiple workers on a SLURM cluster and ensure they don't conflict with each other, +use the `tune_on_slurm.py` script. +This script will launch a SLURM job array with the specified number of workers. +If you want to tune all algorithms on all environments on SLURM, use `tune_all_on_slurm.sh`. + +# Legacy Tuning Scripts + +Note: There are some legacy tuning scripts that can be used like this: + +The hyperparameters of any algorithm in imitation can be tuned using `src/imitation/scripts/tuning.py`. +The benchmarking hyperparameter configs were generated by tuning the hyperparameters using +the search space defined in the `scripts/config/tuning.py`. + +The tuning script proceeds in two phases: +1. Tune the hyperparameters using the search space provided. +2. Re-evaluate the best hyperparameter config found in the first phase + based on the maximum mean return on a separate set of seeds. + Report the mean and standard deviation of these trials. 
+ +To use it with the default search space: +```bash +python -m imitation.scripts.tuning with <algo> 'parallel_run_config.base_named_configs=["<env>"]' +``` + +In this command: +- `<algo>` provides the default search space and settings for the specific algorithm, which is defined in `scripts/config/tuning.py` +- `<env>` sets the environment to tune the algorithm in. They are defined in the algorithm-specific `scripts/config/train_[adversarial|imitation|preference_comparisons|rl].py` files. For the already tuned environments, use the `<algo>_<env>` named configs here. + +See the documentation of `scripts/tuning.py` and `scripts/parallel.py` for many other arguments that can be +provided through the command line to change the tuning behavior. diff --git a/tuning/benchmark_analysis.ipynb b/tuning/benchmark_analysis.ipynb new file mode 100644 index 000000000..a76c39e0d --- /dev/null +++ b/tuning/benchmark_analysis.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5c06148d9ff6b57", + "metadata": { + "collapsed": false + }, + "source": [ + "This notebook loads all the optuna studies in the \"tuning\" folder and arranges them in a dataframe. It also loads the performance of the best model from the paper and the rerun results.\n", + "\n", + "It can serve as a starting point for further analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31e6f532-15c3-494a-8a3a-de25ecc1ee90", + "metadata": {}, + "outputs": [], + "source": [ + "# Load all the studies into a dataframe\n", + "\n", + "import optuna\n", + "from collections import Counter\n", + "from optuna.trial import TrialState\n", + "import pandas as pd\n", + "import numpy as np\n", + "import datetime\n", + "from pathlib import Path\n", + "\n", + "import imitation.util.sacred_file_parsing as sfp\n", + "\n", + "\n", + "experiment_log_files = list(Path().glob(\"*/*.log\"))\n", + "\n", + "experiment_log_files\n", + "\n", + "raw_study_data = []\n", + "\n", + "for log_file in experiment_log_files:\n", + " d = dict()\n", + "\n", + " d[\"logfile\"] = log_file\n", + "\n", + " study = optuna.load_study(\n", + " storage=optuna.storages.JournalStorage(\n", + " optuna.storages.JournalFileStorage(str(log_file))\n", + " ),\n", + " # in our case, we have one journal file per study so the study name can be\n", + " # inferred\n", + " study_name=None,\n", + " )\n", + " d[\"study\"] = study\n", + " d[\"study_name\"] = study.study_name\n", + "\n", + " trial_state_counter = Counter(t.state for t in study.trials)\n", + " n_completed_trials = trial_state_counter[TrialState.COMPLETE]\n", + " d[\"trials\"] = n_completed_trials\n", + " d[\"trials_running\"] = Counter(t.state for t in study.trials)[TrialState.RUNNING]\n", + " d[\"trials_failed\"] = Counter(t.state for t in study.trials)[TrialState.FAIL]\n", + " d[\"all_trials\"] = len(study.trials)\n", + "\n", + " if n_completed_trials > 0:\n", + " d[\"best_value\"] = round(study.best_trial.value, 2)\n", + "\n", + " assert \"_\" in study.study_name\n", + " study_segments = study.study_name.split(\"_\")\n", + " assert len(study_segments) > 3\n", + " tuning, algo, with_ = study_segments[:3]\n", + " assert (tuning, with_) == (\"tuning\", \"with\")\n", + "\n", + " d[\"algo\"] = algo\n", + " d[\"env\"] = \"_\".join(study_segments[3:])\n", + " d[\"best_trial_duration\"] = study.best_trial.duration\n", + " d[\"mean_duration\"] = (\n", + " sum(\n", + " [t.duration for t in study.trials if t.state == TrialState.COMPLETE],\n", + " datetime.timedelta(),\n", + " )\n", + " / n_completed_trials\n", + " )\n", + "\n",
+ " reruns_folder = log_file.parent / \"reruns\"\n", + " rerun_results = [\n", + " round(run[\"result\"][\"imit_stats\"][\"monitor_return_mean\"], 2)\n", + " for conf, run in sfp.find_sacred_runs(reruns_folder, only_completed_runs=True)\n", + " ]\n", + " d[\"rerun_values\"] = rerun_results\n", + "\n", + " raw_study_data.append(d)\n", + "\n", + "study_data = pd.DataFrame(raw_study_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b604bc7e-2e61-4f7f-acfe-87b57e8a2f5a", + "metadata": {}, + "outputs": [], + "source": [ + "# Add performance of the best model from the paper\n", + "import pandas as pd\n", + "\n", + "environments = [\n", + " \"seals_ant\",\n", + " \"seals_half_cheetah\",\n", + " \"seals_hopper\",\n", + " \"seals_swimmer\",\n", + " \"seals_walker\",\n", + " \"seals_humanoid\",\n", + " \"seals_cartpole\",\n", + " \"pendulum\",\n", + " \"seals_mountain_car\",\n", + "]\n", + "\n", + "pc_paper_700 = dict(\n", + " seals_ant=200,\n", + " seals_half_cheetah=4700,\n", + " seals_hopper=4500,\n", + " seals_swimmer=170,\n", + " seals_walker=4900,\n", + " seals_humanoid=\"-\",\n", + " seals_cartpole=\"-\",\n", + " pendulum=1300,\n", + " seals_mountain_car=\"-\",\n", + ")\n", + "\n", + "pc_paper_1400 = dict(\n", + " seals_ant=100,\n", + " seals_half_cheetah=5600,\n", + " seals_hopper=4500,\n", + " seals_swimmer=175,\n", + " seals_walker=5900,\n", + " seals_humanoid=\"-\",\n", + " seals_cartpole=\"-\",\n", + " pendulum=750,\n", + " seals_mountain_car=\"-\",\n", + ")\n", + "\n", + "rl_paper = dict(\n", + " seals_ant=16,\n", + " seals_half_cheetah=420,\n", + " seals_hopper=4210,\n", + " seals_swimmer=175,\n", + " seals_walker=5370,\n", + " seals_humanoid=\"-\",\n", + " seals_cartpole=\"-\",\n", + " pendulum=1300,\n", + " seals_mountain_car=\"-\",\n", + ")\n", + "\n", + "rl_ours = dict(\n", + " seals_ant=3034,\n", + " seals_half_cheetah=1675.76,\n", + " seals_hopper=203.45,\n", + " seals_swimmer=292.84,\n", + " seals_walker=2465.56,\n", + " seals_humanoid=3224.12,\n", + " seals_cartpole=500.00,\n", + " pendulum=-189.25,\n", + " seals_mountain_car=-97.00,\n", + ")\n", + "\n", + "for algo, values_by_env in dict(\n", + " pc_paper_700=pc_paper_700,\n", + " pc_paper_1400=pc_paper_1400,\n", + " rl_paper=rl_paper,\n", + " rl_ours=rl_ours,\n", + ").items():\n", + " for env, value in values_by_env.items():\n", + " if value == \"-\":\n", + " continue\n", + " raw_study_data.append(\n", + " dict(\n", + " algo=algo,\n", + " env=env,\n", + " best_value=value,\n", + " )\n", + " )\n", + "\n", + "study_data = pd.DataFrame(raw_study_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e9ae5ca-5002-411b-beaf-cb98eb12f54c", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "\n", + "print(\"Benchmark Data\")\n", + "display(study_data[[\"algo\", \"env\", \"best_value\"]])\n", + "\n", + "print(\"Rerun Data\")\n", + "display(\n", + " study_data[[\"algo\", \"env\", \"best_value\", \"rerun_values\"]][\n", + " study_data[\"rerun_values\"].map(np.std) > 0\n", + " ]\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py new file mode 100644 index 000000000..5a4e7db1d --- /dev/null +++ b/tuning/hp_search_spaces.py @@ -0,0 +1,321 @@ +"""Definitions for search spaces used when tuning hyperparameters. + +To add a new search space, add a new entry to the `objectives_by_algo` dict. +The key should be the name of the algorithm, and the value should be a RunSacredAsTrial +object that specifies what sacred experiment to run and how to sample hyperparameters. + +Note: you could specify multiple search spaces for the same algorithm. Make sure to give +them different names, and then specify the name when running the tuning script. +For example, to use different spaces for different classes of environments, you could +have a "pc-classic-control" and a "pc-mujoco" search space. +Note: avoid using underscores in the search space names, as they are used to separate +the algorithm name from the search space name when inferring the algorithm name from +the study name. +""" + +import dataclasses +from typing import Any, Callable, Dict, List, Mapping, Optional + +import optuna +import sacred + +import imitation.scripts.train_imitation +import imitation.scripts.train_preference_comparisons as train_pc_script + + +@dataclasses.dataclass +class RunSacredAsTrial: + """Runs a sacred experiment as an optuna trial. + + Assumes that the sacred experiment returns a dict with a key 'imit_stats' that + contains a dict with a key 'monitor_return_mean'. + """ + + """The sacred experiment to run.""" + sacred_ex: sacred.Experiment + + """A function that returns a list of named configs to pass to sacred.run.""" + suggest_named_configs: Callable[[optuna.Trial], List[str]] + + """A function that returns a dict of config updates to pass to sacred.run.""" + suggest_config_updates: Callable[[optuna.Trial], Mapping[str, Any]] + + """Command name to pass to sacred.run.""" + command_name: Optional[str] = None + + def __call__( + self, + trial: optuna.Trial, + run_options: Dict, + extra_named_configs: List[str], + ) -> float: + """Run the sacred experiment and return the performance. + + Args: + trial: The optuna trial to sample hyperparameters for. + run_options: Options to pass to sacred.run(options=). + extra_named_configs: Additional named configs to pass to sacred.run. + + Returns: + The performance of the trial. + + Raises: + RuntimeError: If the trial fails. 
+ """ + config_updates = self.suggest_config_updates(trial) + named_configs = self.suggest_named_configs(trial) + extra_named_configs + + trial.set_user_attr("config_updates", config_updates) + trial.set_user_attr("named_configs", named_configs) + trial.set_user_attr("command_name", self.command_name) + + experiment: sacred.Experiment = self.sacred_ex + result = experiment.run( + command_name=self.command_name, + config_updates=config_updates, + named_configs=named_configs, + options=run_options, + ) + if result.status != "COMPLETED": + raise RuntimeError( + f"Trial failed with {result.fail_trace()} and status {result.status}.", + ) + return result.result["imit_stats"]["monitor_return_mean"] + + +"""A mapping from algorithm names to functions that run the algorithm as an optuna +trial.""" +objectives_by_algo = dict( + pc=RunSacredAsTrial( + sacred_ex=train_pc_script.train_preference_comparisons_ex, + suggest_named_configs=lambda _: ["reward.reward_ensemble"], + suggest_config_updates=lambda trial: { + "seed": trial.number, + "environment": {"num_vec": 8}, + "total_timesteps": 2e7, + "total_comparisons": 1000, + "active_selection": True, + "active_selection_oversampling": trial.suggest_int( + "active_selection_oversampling", + 1, + 11, + ), + "comparison_queue_size": trial.suggest_int( + "comparison_queue_size", + 1, + 1001, + ), # upper bound determined by total_comparisons=1000 + "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), + "fragment_length": trial.suggest_int( + "fragment_length", + 1, + 1001, + ), # trajectories are 1000 steps long + "gatherer_kwargs": { + "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), + "discount_factor": trial.suggest_float( + "gatherer_discount_factor", + 0.95, + 1.0, + ), + "sample": trial.suggest_categorical("gatherer_sample", [True, False]), + }, + "initial_epoch_multiplier": trial.suggest_float( + "initial_epoch_multiplier", + 1, + 200.0, + ), + "initial_comparison_frac": trial.suggest_float( + "initial_comparison_frac", + 0.01, + 1.0, + ), + "num_iterations": trial.suggest_int("num_iterations", 1, 51), + "preference_model_kwargs": { + "noise_prob": trial.suggest_float( + "preference_model_noise_prob", + 0.0, + 0.1, + ), + "discount_factor": trial.suggest_float( + "preference_model_discount_factor", + 0.95, + 1.0, + ), + }, + "query_schedule": trial.suggest_categorical( + "query_schedule", + [ + "hyperbolic", + "constant", + "inverse_quadratic", + ], + ), + "trajectory_generator_kwargs": { + "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), + "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), + }, + "transition_oversampling": trial.suggest_float( + "transition_oversampling", + 0.9, + 2.0, + ), + "reward_trainer_kwargs": { + "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11), + }, + "rl": { + "rl_kwargs": { + "ent_coef": trial.suggest_float( + "rl_ent_coef", + 1e-7, + 1e-3, + log=True, + ), + }, + }, + }, + ), + pc_classic_control=RunSacredAsTrial( + sacred_ex=train_pc_script.train_preference_comparisons_ex, + suggest_named_configs=lambda _: ["reward.reward_ensemble"], + suggest_config_updates=lambda trial: { + "seed": trial.number, + "environment": {"num_vec": 8}, + "total_timesteps": 1e6, + "total_comparisons": 1000, + "active_selection": True, + "active_selection_oversampling": trial.suggest_int( + "active_selection_oversampling", + 1, + 11, + ), + "comparison_queue_size": trial.suggest_int( + "comparison_queue_size", + 1, + 1001, + ), # upper bound determined by 
total_comparisons=1000 + "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), + "fragment_length": trial.suggest_int( + "fragment_length", + 1, + 201, + ), # trajectories are 1000 steps long + "gatherer_kwargs": { + "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), + "discount_factor": trial.suggest_float( + "gatherer_discount_factor", + 0.95, + 1.0, + ), + "sample": trial.suggest_categorical("gatherer_sample", [True, False]), + }, + "initial_epoch_multiplier": trial.suggest_float( + "initial_epoch_multiplier", + 1, + 200.0, + ), + "initial_comparison_frac": trial.suggest_float( + "initial_comparison_frac", + 0.01, + 1.0, + ), + "num_iterations": trial.suggest_int("num_iterations", 1, 51), + "preference_model_kwargs": { + "noise_prob": trial.suggest_float( + "preference_model_noise_prob", + 0.0, + 0.1, + ), + "discount_factor": trial.suggest_float( + "preference_model_discount_factor", + 0.95, + 1.0, + ), + }, + "query_schedule": trial.suggest_categorical( + "query_schedule", + [ + "hyperbolic", + "constant", + "inverse_quadratic", + ], + ), + "trajectory_generator_kwargs": { + "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), + "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), + }, + "transition_oversampling": trial.suggest_float( + "transition_oversampling", + 0.9, + 2.0, + ), + "reward_trainer_kwargs": { + "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11), + }, + "rl": { + "rl_kwargs": { + "ent_coef": trial.suggest_float( + "rl_ent_coef", + 1e-7, + 1e-3, + log=True, + ), + }, + }, + }, + ), + sqil=RunSacredAsTrial( + sacred_ex=imitation.scripts.train_imitation.train_imitation_ex, + command_name="sqil", + suggest_named_configs=lambda _: ["rl.dqn"], + suggest_config_updates=lambda trial: { + "seed": trial.number, + "demonstrations": { + "n_expert_demos": 100, + "source": "generated", + }, + "rl": { + "rl_kwargs": { + "learning_rate": trial.suggest_float( + "learning_rate", + 1e-6, + 1e-2, + log=True, + ), + "buffer_size": trial.suggest_int("buffer_size", 1000, 100000), + "learning_starts": trial.suggest_int( + "learning_starts", + 1000, + 10000, + ), + "batch_size": trial.suggest_int("batch_size", 32, 128), + "tau": trial.suggest_float("tau", 0.0, 1.0), + "gamma": trial.suggest_float("gamma", 0.9, 0.999), + "train_freq": trial.suggest_int("train_freq", 1, 40), + "gradient_steps": trial.suggest_int("gradient_steps", 1, 10), + "target_update_interval": trial.suggest_int( + "target_update_interval", + 1, + 10000, + ), + "exploration_fraction": trial.suggest_float( + "exploration_fraction", + 0.01, + 0.5, + ), + "exploration_final_eps": trial.suggest_float( + "exploration_final_eps", + 0.01, + 1.0, + ), + "exploration_initial_eps": trial.suggest_float( + "exploration_initial_eps", + 0.01, + 0.5, + ), + "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 10.0), + }, + }, + }, + ), +) diff --git a/tuning/rerun_best_trial.py b/tuning/rerun_best_trial.py new file mode 100644 index 000000000..7b878a02e --- /dev/null +++ b/tuning/rerun_best_trial.py @@ -0,0 +1,86 @@ +"""Script to re-run the best trials from a previous hyperparameter tuning run.""" +import argparse +import random + +import hp_search_spaces +import optuna +import sacred + + +def make_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Re-run the best trial from a previous tuning run.", + epilog="Example usage:\npython rerun_best_trials.py tuning_run.json\n", + 
formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--algo", + type=str, + default=None, + choices=hp_search_spaces.objectives_by_algo.keys(), + help="The algorithm that has been tuned. " + "Can usually be deduced from the study name.", + ) + parser.add_argument( + "journal_log", + type=str, + help="The optuna journal file of the previous tuning run.", + ) + parser.add_argument( + "--seed", + type=int, + default=random.randint(0, 2**32 - 1), + help="The seed to use for the re-run. A random seed is used by default.", + ) + return parser + + +def infer_algo_name(study: optuna.Study) -> str: + """Infer the algo name from the study name. + + Assumes that the study name is of the form "tuning_{algo}_with_{named_configs}". + + Args: + study: The optuna study. + + Returns: + algo name + """ + assert study.study_name.startswith("tuning_") + assert "_with_" in study.study_name + return study.study_name[len("tuning_") :].split("_with_")[0] + + +def main(): + parser = make_parser() + args = parser.parse_args() + study: optuna.Study = optuna.load_study( + storage=optuna.storages.JournalStorage( + optuna.storages.JournalFileStorage(args.journal_log), + ), + # in our case, we have one journal file per study so the study name can be + # inferred + study_name=None, + ) + trial = study.best_trial + + algo_name = args.algo or infer_algo_name(study) + sacred_experiment: sacred.Experiment = hp_search_spaces.objectives_by_algo[ + algo_name + ].sacred_ex + + config_updates = trial.user_attrs["config_updates"].copy() + config_updates["seed"] = args.seed + result = sacred_experiment.run( + config_updates=config_updates, + named_configs=trial.user_attrs["named_configs"], + options={"--name": study.study_name, "--file_storage": "sacred"}, + ) + if result.status != "COMPLETED": + raise RuntimeError( + f"Trial failed with {result.fail_trace()} and status {result.status}.", + ) + + +if __name__ == "__main__": + main() diff --git a/tuning/rerun_on_slurm.sh b/tuning/rerun_on_slurm.sh new file mode 100644 index 000000000..c8c32a8e4 --- /dev/null +++ b/tuning/rerun_on_slurm.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH --array=1-5 +# Avoid cluttering the root directory with log files: +#SBATCH --output=%x/reruns/%a/sbatch_cout.txt +#SBATCH --cpus-per-task=8 +#SBATCH --gpus=0 +#SBATCH --mem=8gb +#SBATCH --time=70:00:00 +#SBATCH --qos=scavenger +#SBATCH --export=ALL + +# DESCRIPTION: +# Reruns the top trials from a previous hyperparameter sweep. + +# PREREQUISITES: +# A folder with a hyperparameter sweep as started by tune_on_slurm.sh. + +# USAGE: +# sbatch --job-name=<tune_folder> rerun_on_slurm.sh +# +# Picks the best trial from the optuna study in <tune_folder> and reruns it with +# the same hyperparameters but different seeds. + +# OUTPUT: +# Creates a sub-folder in the given tune_folder for each worker: +# <tune_folder>/reruns/<array_task_id>/ +# The output of each worker is written to a cout.txt. + +# shellcheck disable=SC1090 +source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" + +worker_dir="$SLURM_JOB_NAME/reruns/$SLURM_ARRAY_TASK_ID/" + +if [ -f "$worker_dir/cout.txt" ]; then + # This indicates that there is already a worker running in that directory. + # So we better abort! + echo "There is already a worker running in this directory. \ + Try different seeds by picking a different array range!" + exit 1 +else + # Note: we run each worker in a separate working directory to avoid race + # conditions when writing sacred outputs to the same folder.
+ mkdir -p "$worker_dir" +fi + +cd "$worker_dir" || exit + +srun --output="cout.txt" python ../../../rerun_best_trial.py "../../optuna_study.log" --seed "$SLURM_ARRAY_TASK_ID" diff --git a/tuning/tune.py b/tuning/tune.py new file mode 100644 index 000000000..4a2e710c5 --- /dev/null +++ b/tuning/tune.py @@ -0,0 +1,86 @@ +"""Script to tune hyperparameters for imitation learning algorithms using optuna.""" +import argparse + +import optuna +from hp_search_spaces import objectives_by_algo + + +def make_parser() -> argparse.ArgumentParser: + example_usage = "python tune.py pc seals_swimmer" + possible_named_configs = "\n".join( + f" - {algo}: {', '.join(objective.sacred_ex.named_configs.keys())}" + for algo, objective in objectives_by_algo.items() + ) + + parser = argparse.ArgumentParser( + description="Tune hyperparameters for imitation learning algorithms.", + epilog=f"Example usage:\n{example_usage}\n\n" + f"Possible named configs:\n{possible_named_configs}", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "algo", + type=str, + default="pc", + choices=objectives_by_algo.keys(), + help="What algorithm to tune.", + ) + parser.add_argument( + "named_configs", + type=str, + nargs="+", + default=[], + help="Additional named configs to pass to the sacred experiment. " + "Use this to select the environment to tune on.", + ) + parser.add_argument( + "--num_trials", + type=int, + default=100, + help="Number of trials to run.", + ) + parser.add_argument( + "-j", + "--journal-log", + type=str, + default=None, + help="A journal file to synchronize multiple instances of this script. " + "Works on NFS storage.", + ) + return parser + + +def make_study(args: argparse.Namespace) -> optuna.Study: + if args.journal_log is not None: + storage = optuna.storages.JournalStorage( + optuna.storages.JournalFileStorage(args.journal_log), + ) + else: + storage = None + + return optuna.create_study( + study_name=f"tuning_{args.algo}_with_{'_'.join(args.named_configs)}", + storage=storage, + load_if_exists=True, + direction="maximize", + ) + + +def main(): + parser = make_parser() + args = parser.parse_args() + study = make_study(args) + + study.optimize( + lambda trial: objectives_by_algo[args.algo]( + trial, + run_options={"--name": study.study_name, "--file_storage": "sacred"}, + extra_named_configs=args.named_configs, + ), + callbacks=[optuna.study.MaxTrialsCallback(args.num_trials)], + gc_after_trial=True, + ) + + +if __name__ == "__main__": + main() diff --git a/tuning/tune_all_on_slurm.sh b/tuning/tune_all_on_slurm.sh new file mode 100644 index 000000000..ce25231b1 --- /dev/null +++ b/tuning/tune_all_on_slurm.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +sbatch --job-name=tuning_pc_on_cartpole tune_on_slurm.sh pc cartpole +sbatch --job-name=tuning_pc_on_seals_ant tune_on_slurm.sh pc seals_ant +sbatch --job-name=tuning_pc_on_seals_half_cheetah --mem=16gb tune_on_slurm.sh pc seals_half_cheetah +sbatch --job-name=tuning_pc_on_seals_hopper tune_on_slurm.sh pc seals_hopper +sbatch --job-name=tuning_pc_on_seals_swimmer tune_on_slurm.sh pc seals_swimmer +sbatch --job-name=tuning_pc_on_seals_walker tune_on_slurm.sh pc seals_walker +sbatch --job-name=tuning_pc_on_seals_humanoid --mem=32gb tune_on_slurm.sh pc seals_humanoid +sbatch --job-name=tuning_pc_on_seals_cartpole tune_on_slurm.sh pc seals_cartpole +sbatch --job-name=tuning_pc_on_pendulum tune_on_slurm.sh pc pendulum +sbatch --job-name=tuning_pc_on_seals_mountain_car tune_on_slurm.sh pc seals_mountain_car + +sbatch 
--job-name=tuning_sqil_on_seals_mountain_car tune_on_slurm.sh sqil seals_mountain_car +sbatch --job-name=tuning_sqil_on_seals_cartpole tune_on_slurm.sh sqil seals_cartpole diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh new file mode 100644 index 000000000..004228c59 --- /dev/null +++ b/tuning/tune_on_slurm.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --array=1-100 +# Avoid cluttering the root directory with log files: +#SBATCH --output=%x/%a/sbatch_cout.txt +#SBATCH --cpus-per-task=8 +#SBATCH --gpus=0 +#SBATCH --mem=8gb +#SBATCH --time=70:00:00 +#SBATCH --qos=scavenger +#SBATCH --export=ALL + +# DESCRIPTION: +# This script is used to tune the hyperparameters of an algorithm on a given +# environment in parallel on a SLURM cluster with 400 trials and 100 workers. + +# PREREQUISITES: +# This script assumes that you set up imitation in your NAS home directory and +# installed it in a venv located in the imitation directory. +# /nas/ucb/(your username)/imitation/venv/ +# Do this by running the following commands: +# cd /nas/ucb/(your username)/ +# git clone https://github.com/HumanCompatibleAI/imitation.git +# srun python3 -m venv venv +# source venv/bin/activate +# srun pip install -e . +# It is important to set up the venv using srun to ensure that the venv is working +# properly on the compute nodes. + +# USAGE: +# Run this script with sbatch and pass it the algorithm and the environment +# named-config. For example, to tune PC on CartPole, run: +# sbatch --job-name=tuning_pc_on_cartpole tune_on_slurm.sh pc cartpole +# To change the number of workers, change the --array parameter above +# or pass the --array argument to sbatch. +# To change the number of trials, change the --num_trials parameter below. +# All algorithms and environments supported by the tune.py script can be used. +# Run tune.py --help for more information. + +# OUTPUT: +# This script creates a folder with the name of the SLURM job and a numbered sub-folder for +# each worker: <job_name>/<array_task_id> +# The main folder contains the optuna journal log (optuna_study.log) for synchronizing the workers. +# It is fine to place this log on an NFS drive shared among all workers. +# Each worker is executed within its own sub-folder to ensure that their outputs +# do not conflict with each other. +# The output of each worker is written to a cout.txt. +# The output of the sbatch command is written to sbatch_cout.txt. + +# CONTINUING A TUNING RUN: +# Often it is desirable to continue an existing job or add more workers to it while it +# is running. Just run this batch job again, but change the --array parameter to +# ensure that the new workers do not conflict with the old ones. E.g. if you first ran +# the batch script with --array=1-100 (the default), a subsequent run should be launched +# with the --array=101-150 (for another 50 workers). For this you do not need to modify +# this file. You can pass --array to sbatch to override it. + +# shellcheck disable=SC1090 +source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" + +if [ -f "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID/cout.txt" ]; then + # Note: this will just be written to sbatch_cout.txt and not to cout.txt to avoid + # overriding existing cout.txt files. Unfortunately sbatch won't print this for us + # so it is not very useful information. + echo "The study folder for $SLURM_JOB_NAME already contains a folder for job $SLURM_ARRAY_TASK_ID!" + echo "Are you trying to continue on an existing study? Then adapt the sbatch array range!" + echo "E.g. 
if the highest folder number in \"$SLURM_JOB_NAME/\" is 100 and you want to continue the study with another 50 runners, start this script using \"sbatch --job-name=$SLURM_JOB_NAME --array=101-150 tune_on_slurm.sh $1 $2\"" + exit 1 +else + # Note: we run each worker in a separate working directory to avoid race + # conditions when writing sacred outputs to the same folder. + mkdir -p "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID" +fi + +cd "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID" || exit + +srun --output=cout.txt python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2"