diff --git a/RL-1.ipynb b/RL-1.ipynb
new file mode 100644
index 0000000..b2b4035
--- /dev/null
+++ b/RL-1.ipynb
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gym"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---------+\n",
+      "|R: | : :\u001b[35mG\u001b[0m|\n",
+      "| : | : : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|Y| : |\u001b[34;1mB\u001b[0m:\u001b[43m \u001b[0m|\n",
+      "+---------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "env = gym.make(\"Taxi-v3\").env\n",
+    "env.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.reset()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---------+\n",
+      "|\u001b[43mR\u001b[0m: | : :\u001b[34;1mG\u001b[0m|\n",
+      "| : | : : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|Y| : |\u001b[35mB\u001b[0m: |\n",
+      "+---------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "env.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---------+\n",
+      "|R: | : :\u001b[35mG\u001b[0m|\n",
+      "| : |\u001b[43m \u001b[0m: : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|\u001b[34;1mY\u001b[0m| : |B: |\n",
+      "+---------+\n",
+      "\n",
+      "Action Space Discrete(6)\n",
+      "State Space Discrete(500)\n"
+     ]
+    }
+   ],
+   "source": [
+    "env.reset()\n",
+    "env.render()\n",
+    "print(\"Action Space {}\".format(env.action_space))\n",
+    "print(\"State Space {}\".format(env.observation_space))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "State : 328\n",
+      "+---------+\n",
+      "|R: | : :\u001b[35mG\u001b[0m|\n",
+      "| : |\u001b[43m \u001b[0m: : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|\u001b[34;1mY\u001b[0m| : |B: |\n",
+      "+---------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "state = env.encode(3, 1, 2, 0)  # (taxi row, taxi column, passenger index, destination index)\n",
+    "print(\"State :\", state)\n",
+    "env.s = state\n",
+    "env.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{0: [(1.0, 428, -1, False)],\n",
+       " 1: [(1.0, 228, -1, False)],\n",
+       " 2: [(1.0, 348, -1, False)],\n",
+       " 3: [(1.0, 328, -1, False)],\n",
+       " 4: [(1.0, 328, -10, False)],\n",
+       " 5: [(1.0, 328, -10, False)]}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.P[328]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Timesteps taken: 1079\n",
+      "Penalties incurred: 354\n"
+     ]
+    }
+   ],
+   "source": [
+    "env.s = 328\n",
+    "epochs = 0\n",
+    "penalties, reward = 0, 0\n",
+    "frames = []\n",
+    "done = False\n",
+    "while not done:\n",
+    "    action = env.action_space.sample()\n",
+    "    state, reward, done, info = env.step(action)\n",
+    "    if reward == -10:\n",
+    "        penalties += 1\n",
+    "\n",
+    "    frames.append({\n",
+    "        'frame': env.render(mode='ansi'),\n",
+    "        'state': state,\n",
+    "        'action': action,\n",
+ " 'reward': reward\n", + " }\n", + " )\n", + " epochs +=1\n", + "print(\"Timesteps taken: {}\".format(epochs))\n", + "print(\"Penalties incurred: {}\".format(penalties))\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timestep: 1079\n", + "State: 0\n", + "Action: 5\n", + "Reward: 20\n" + ] + } + ], + "source": [ + "from IPython.display import clear_output\n", + "from time import sleep\n", + "\n", + "def print_frames(frames):\n", + " for i, frame in enumerate(frames):\n", + " clear_output(wait=True)\n", + "# print(frame['frame'].getvalue())\n", + " print(f\"Timestep: {i + 1}\")\n", + " print(f\"State: {frame['state']}\")\n", + " print(f\"Action: {frame['action']}\")\n", + " print(f\"Reward: {frame['reward']}\")\n", + " sleep(.1)\n", + " \n", + "print_frames(frames)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "q_table = np.zeros([env.observation_space.n, env.action_space.n])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Episode: 100000\n", + "Training finished.\n", + "\n" + ] + } + ], + "source": [ + "\"\"\" Training The Agent \"\"\"\n", + "import random\n", + "from IPython.display import clear_output\n", + "\n", + "alpha = 0.1\n", + "gamma = 0.6\n", + "epsilon = 0.1\n", + "all_epochs = []\n", + "all_penalties = []\n", + "for i in range(1, 100001):\n", + " state = env.reset()\n", + " epochs, penalties, reward = 0, 0, 0\n", + " done = False\n", + " \n", + " while not done:\n", + " if random.uniform(0,1) < epsilon:\n", + " action = env.action_space.sample() #Explore action space\n", + " else:\n", + " action = np.argmax(q_table[state]) #Exploit learned values\n", + " \n", + " next_state, reward, done, info = env.step(action) \n", + " \n", + " old_value = q_table[state, action]\n", + " next_max = np.max(q_table[next_state])\n", + " \n", + " new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)\n", + " q_table[state, action] = new_value\n", + "\n", + " if reward == -10:\n", + " penalties += 1\n", + "\n", + " state = next_state\n", + " epochs += 1\n", + " \n", + " if i % 100 == 0:\n", + " clear_output(wait=True)\n", + " print(f\"Episode: {i}\")\n", + "\n", + "print(\"Training finished.\\n\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ -2.40943541, -2.27325184, -2.41396927, -2.36299859,\n", + " -10.52639717, -10.68579624])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q_table[328]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results after 500 episodes:\n", + "Average timesteps per episode: 12.906\n", + "Average penalties per episode: 0.0\n" + ] + } + ], + "source": [ + "\"\"\"Evaluate agent's performance after Q-learning\"\"\"\n", + "\n", + "total_epochs, total_penalties = 0, 0\n", + "episodes = 500\n", + "\n", + "for _ in range(episodes):\n", + " state = env.reset()\n", + " epochs, penalties, reward = 0, 0, 0\n", + " \n", + " done = False\n", + " \n", + " while not done:\n", + " action = np.argmax(q_table[state])\n", + " state, reward, done, info = 
+    "\n",
+    "        if reward == -10:\n",
+    "            penalties += 1\n",
+    "\n",
+    "        epochs += 1\n",
+    "\n",
+    "    total_penalties += penalties\n",
+    "    total_epochs += epochs\n",
+    "\n",
+    "print(f\"Results after {episodes} episodes:\")\n",
+    "print(f\"Average timesteps per episode: {total_epochs / episodes}\")\n",
+    "print(f\"Average penalties per episode: {total_penalties / episodes}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
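For reference, the update applied inside the notebook's training cell is the standard tabular Q-learning rule: Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max over a' of Q(s', a')). The sketch below restates that single step as a standalone function; the name q_update and its default hyperparameters are illustrative only and are not part of the notebook.

import numpy as np

def q_update(q_table, state, action, reward, next_state, alpha=0.1, gamma=0.6):
    # One tabular Q-learning step, mirroring the update in the training cell above.
    old_value = q_table[state, action]
    next_max = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
    return q_table[state, action]

# Example with values taken from env.P[328] above: action 0 from state 328
# moves to state 428 with reward -1.
# q_update(q_table, 328, 0, -1, 428)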
diff --git a/Rewards.py b/Rewards.py
new file mode 100644
index 0000000..6174d33
--- /dev/null
+++ b/Rewards.py
@@ -0,0 +1,161 @@
+"""
+Evaluate an inferred reward function by using it to train a policy in the original env.
+"""
+
+import argparse
+import datetime
+
+import cv2
+import gym
+import numpy as np
+import matplotlib.pyplot as plt
+import tensorflow as tf
+
+from sacred import Experiment
+from sacred.observers import FileStorageObserver, RunObserver
+
+from stable_baselines import SAC
+from stable_baselines.sac.policies import MlpPolicy as MlpPolicySac
+
+from deep_rlsp.util.results import Artifact, FileExperimentResults
+from deep_rlsp.model import StateVAE
+from deep_rlsp.envs.reward_wrapper import LatentSpaceRewardWrapper
+from deep_rlsp.util.video import render_mujoco_from_obs
+from deep_rlsp.util.helper import get_trajectory, evaluate_policy
+from deep_rlsp.model.mujoco_debug_models import MujocoDebugFeatures, PendulumDynamics
+from deep_rlsp.solvers import get_sac
+
+# changes the run _id and thereby the path that the FileStorageObserver
+# writes the results
+# cf. https://github.com/IDSIA/sacred/issues/174
+class SetID(RunObserver):
+    priority = 50  # very high priority to set id
+
+    def started_event(
+        self, ex_info, command, host_info, start_time, config, meta_info, _id
+    ):
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        label = config["experiment_folder"].strip("/").split("/")[-1]
+        custom_id = "{}_{}".format(timestamp, label)
+        return custom_id  # started_event returns the _run._id
+
+
+ex = Experiment("mujoco-eval")
+ex.observers = [SetID(), FileStorageObserver.create("results/mujoco/eval")]
+
+
+def print_rollout(env, policy, latent_space, decode=False):
+    state = env.reset()
+    done = False
+    while not done:
+        a, _ = policy.predict(state, deterministic=False)
+        state, reward, done, info = env.step(a)
+        if decode:
+            obs = latent_space.decoder(state)
+        else:
+            obs = state
+        print("action", a)
+        print("obs", obs)
+        print("reward", reward)
+
+
+@ex.config
+def config():
+    experiment_folder = None  # noqa:F841
+    iteration = -1  # noqa:F841
+
+
+@ex.automain
+def main(_run, experiment_folder, iteration, seed):
+    ex = FileExperimentResults(experiment_folder)
+    env_id = ex.config["env_id"]
+    env = gym.make(env_id)
+
+    if env_id == "InvertedPendulum-v2":
+        iterations = int(6e4)
+    else:
+        iterations = int(2e6)
+
+    label = experiment_folder.strip("/").split("/")[-1]
+
+    if ex.config["debug_handcoded_features"]:
+        latent_space = MujocoDebugFeatures(env)
+    else:
+        graph_latent = tf.Graph()
+        latent_model_path = ex.info["latent_model_checkpoint"]
+        with graph_latent.as_default():
+            latent_space = StateVAE.restore(latent_model_path)
+
+    r_inferred = ex.info["inferred_rewards"][iteration]
+    r_inferred /= np.linalg.norm(r_inferred)
+
+    print("r_inferred")
+    print(r_inferred)
+    if env_id.startswith("Fetch"):
+        env_has_task_reward = True
+        inferred_weight = 0.1
+    else:
+        env_has_task_reward = False
+        inferred_weight = None
+
+    env_inferred = LatentSpaceRewardWrapper(
+        env,
+        latent_space,
+        r_inferred,
+        inferred_weight=inferred_weight,
+        use_task_reward=env_has_task_reward,
+    )
+
+    policy_inferred = get_sac(env_inferred)
+    policy_inferred.learn(total_timesteps=iterations, log_interval=10)
+    with Artifact("policy.zip", None, _run) as f:
+        policy_inferred.save(f)
+
+    print_rollout(env_inferred, policy_inferred, latent_space)
+
+    N = 10
+    true_reward_obtained = evaluate_policy(env, policy_inferred, N)
+    print("Inferred reward policy: true return", true_reward_obtained)
+    if env_has_task_reward:
+        env.use_penalty = False
+        task_reward_obtained = evaluate_policy(env, policy_inferred, N)
+        print("Inferred reward policy: task return", task_reward_obtained)
+        env.use_penalty = True
+    with Artifact("video.mp4", None, _run) as f:
+        inferred_reward_obtained = evaluate_policy(
+            env_inferred, policy_inferred, N, video_out=f
+        )
+    print("Inferred reward policy: inferred return", inferred_reward_obtained)
+
+    good_policy_path = ex.config["good_policy_path"]
+    if good_policy_path is not None:
+        true_reward_policy = SAC.load(good_policy_path)
+        good_policy_true_reward_obtained = evaluate_policy(env, true_reward_policy, N)
+        print("True reward policy: true return", good_policy_true_reward_obtained)
+        if env_has_task_reward:
+            env.use_penalty = False
+            good_policy_task_reward_obtained = evaluate_policy(
+                env, true_reward_policy, N
+            )
+            print("True reward policy: task return", good_policy_task_reward_obtained)
+            env.use_penalty = True
+        good_policy_inferred_reward_obtained = evaluate_policy(
+            env_inferred, true_reward_policy, N
+        )
+        print(
+            "True reward policy: inferred return", good_policy_inferred_reward_obtained
+        )
policy: inferred return", good_policy_inferred_reward_obtained + ) + + random_policy = SAC(MlpPolicySac, env_inferred, verbose=1) + random_policy_true_reward_obtained = evaluate_policy(env, random_policy, N) + print("Random policy: true return", random_policy_true_reward_obtained) + if env_has_task_reward: + env.use_penalty = False + random_policy_task_reward_obtained = evaluate_policy(env, random_policy, N) + print("Random reward policy: task return", random_policy_task_reward_obtained) + env.use_penalty = True + random_policy_inferred_reward_obtained = evaluate_policy( + env_inferred, random_policy, N + ) + print("Random policy: inferred return", random_policy_inferred_reward_obtained) + print()