diff --git a/RL-1.ipynb b/RL-1.ipynb
new file mode 100644
index 0000000..b2b4035
--- /dev/null
+++ b/RL-1.ipynb
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gym"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---------+\n",
+      "|R: | : :\u001b[35mG\u001b[0m|\n",
+      "| : | : : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|Y| : |\u001b[34;1mB\u001b[0m:\u001b[43m \u001b[0m|\n",
+      "+---------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "env = gym.make(\"Taxi-v3\").env\n",
+    "env.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.reset()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---------+\n",
+      "|\u001b[43mR\u001b[0m: | : :\u001b[34;1mG\u001b[0m|\n",
+      "| : | : : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|Y| : |\u001b[35mB\u001b[0m: |\n",
+      "+---------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "env.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---------+\n",
+      "|R: | : :\u001b[35mG\u001b[0m|\n",
+      "| : |\u001b[43m \u001b[0m: : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|\u001b[34;1mY\u001b[0m| : |B: |\n",
+      "+---------+\n",
+      "\n",
+      "Action Space Discrete(6)\n",
+      "State Space Discrete(500)\n"
+     ]
+    }
+   ],
+   "source": [
+    "env.reset()\n",
+    "env.render()\n",
+    "print(\"Action Space {}\".format(env.action_space))\n",
+    "print(\"State Space {}\".format(env.observation_space))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "State : 328\n",
+      "+---------+\n",
+      "|R: | : :\u001b[35mG\u001b[0m|\n",
+      "| : |\u001b[43m \u001b[0m: : |\n",
+      "| : : : : |\n",
+      "| | : | : |\n",
+      "|\u001b[34;1mY\u001b[0m| : |B: |\n",
+      "+---------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "state = env.encode(3, 1, 2, 0)  # (taxi row, taxi column, passenger index, destination index)\n",
+    "print(\"State :\", state)\n",
+    "env.s = state\n",
+    "env.render()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{0: [(1.0, 428, -1, False)],\n",
+       " 1: [(1.0, 228, -1, False)],\n",
+       " 2: [(1.0, 348, -1, False)],\n",
+       " 3: [(1.0, 328, -1, False)],\n",
+       " 4: [(1.0, 328, -10, False)],\n",
+       " 5: [(1.0, 328, -10, False)]}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.P[328]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Timesteps taken: 1079\n",
+      "Penalties incurred: 354\n"
+     ]
+    }
+   ],
+   "source": [
+    "env.s = 328\n",
+    "epochs = 0\n",
+    "penalties, reward = 0, 0\n",
+    "frames = []\n",
+    "done = False\n",
+    "while not done:\n",
+    "    action = env.action_space.sample()\n",
+    "    state, reward, done, info = env.step(action)\n",
+    "    if reward == -10:\n",
+    "        penalties += 1\n",
+    "\n",
+    "    frames.append({\n",
+    "        'frame': env.render(mode='ansi'),\n",
+    "        'state': state,\n",
+    "        'action': action,\n",
+ " 'reward': reward\n", + " }\n", + " )\n", + " epochs +=1\n", + "print(\"Timesteps taken: {}\".format(epochs))\n", + "print(\"Penalties incurred: {}\".format(penalties))\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timestep: 1079\n", + "State: 0\n", + "Action: 5\n", + "Reward: 20\n" + ] + } + ], + "source": [ + "from IPython.display import clear_output\n", + "from time import sleep\n", + "\n", + "def print_frames(frames):\n", + " for i, frame in enumerate(frames):\n", + " clear_output(wait=True)\n", + "# print(frame['frame'].getvalue())\n", + " print(f\"Timestep: {i + 1}\")\n", + " print(f\"State: {frame['state']}\")\n", + " print(f\"Action: {frame['action']}\")\n", + " print(f\"Reward: {frame['reward']}\")\n", + " sleep(.1)\n", + " \n", + "print_frames(frames)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "q_table = np.zeros([env.observation_space.n, env.action_space.n])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Episode: 100000\n", + "Training finished.\n", + "\n" + ] + } + ], + "source": [ + "\"\"\" Training The Agent \"\"\"\n", + "import random\n", + "from IPython.display import clear_output\n", + "\n", + "alpha = 0.1\n", + "gamma = 0.6\n", + "epsilon = 0.1\n", + "all_epochs = []\n", + "all_penalties = []\n", + "for i in range(1, 100001):\n", + " state = env.reset()\n", + " epochs, penalties, reward = 0, 0, 0\n", + " done = False\n", + " \n", + " while not done:\n", + " if random.uniform(0,1) < epsilon:\n", + " action = env.action_space.sample() #Explore action space\n", + " else:\n", + " action = np.argmax(q_table[state]) #Exploit learned values\n", + " \n", + " next_state, reward, done, info = env.step(action) \n", + " \n", + " old_value = q_table[state, action]\n", + " next_max = np.max(q_table[next_state])\n", + " \n", + " new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)\n", + " q_table[state, action] = new_value\n", + "\n", + " if reward == -10:\n", + " penalties += 1\n", + "\n", + " state = next_state\n", + " epochs += 1\n", + " \n", + " if i % 100 == 0:\n", + " clear_output(wait=True)\n", + " print(f\"Episode: {i}\")\n", + "\n", + "print(\"Training finished.\\n\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ -2.40943541, -2.27325184, -2.41396927, -2.36299859,\n", + " -10.52639717, -10.68579624])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q_table[328]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results after 500 episodes:\n", + "Average timesteps per episode: 12.906\n", + "Average penalties per episode: 0.0\n" + ] + } + ], + "source": [ + "\"\"\"Evaluate agent's performance after Q-learning\"\"\"\n", + "\n", + "total_epochs, total_penalties = 0, 0\n", + "episodes = 500\n", + "\n", + "for _ in range(episodes):\n", + " state = env.reset()\n", + " epochs, penalties, reward = 0, 0, 0\n", + " \n", + " done = False\n", + " \n", + " while not done:\n", + " action = np.argmax(q_table[state])\n", + " state, reward, done, info = 
+    "\n",
+    "        if reward == -10:\n",
+    "            penalties += 1\n",
+    "\n",
+    "        epochs += 1\n",
+    "\n",
+    "    total_penalties += penalties\n",
+    "    total_epochs += epochs\n",
+    "\n",
+    "print(f\"Results after {episodes} episodes:\")\n",
+    "print(f\"Average timesteps per episode: {total_epochs / episodes}\")\n",
+    "print(f\"Average penalties per episode: {total_penalties / episodes}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
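For reference, the update applied inside the notebook's training cell is the standard tabular Q-learning rule: Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max over a' of Q(s', a')). The sketch below restates that single step as a standalone function; the name q_update and its default hyperparameters are illustrative only and are not part of the notebook.

import numpy as np

def q_update(q_table, state, action, reward, next_state, alpha=0.1, gamma=0.6):
    # One tabular Q-learning step, mirroring the update in the training cell above.
    old_value = q_table[state, action]
    next_max = np.max(q_table[next_state])
    q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
    return q_table[state, action]

# Example with values taken from env.P[328] above: action 0 from state 328
# moves to state 428 with reward -1.
# q_update(q_table, 328, 0, -1, 428)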
diff --git a/Rewards.py b/Rewards.py
new file mode 100644
index 0000000..6174d33
--- /dev/null
+++ b/Rewards.py
@@ -0,0 +1,161 @@
+"""
+Evaluate an inferred reward function by using it to train a policy in the original env.
+"""
+
+import argparse
+import datetime
+
+import cv2
+import gym
+import numpy as np
+import matplotlib.pyplot as plt
+import tensorflow as tf
+
+from sacred import Experiment
+from sacred.observers import FileStorageObserver, RunObserver
+
+from stable_baselines import SAC
+from stable_baselines.sac.policies import MlpPolicy as MlpPolicySac
+
+from deep_rlsp.util.results import Artifact, FileExperimentResults
+from deep_rlsp.model import StateVAE
+from deep_rlsp.envs.reward_wrapper import LatentSpaceRewardWrapper
+from deep_rlsp.util.video import render_mujoco_from_obs
+from deep_rlsp.util.helper import get_trajectory, evaluate_policy
+from deep_rlsp.model.mujoco_debug_models import MujocoDebugFeatures, PendulumDynamics
+from deep_rlsp.solvers import get_sac
+
+# changes the run _id and thereby the path that the FileStorageObserver
+# writes the results
+# cf. https://github.com/IDSIA/sacred/issues/174
+class SetID(RunObserver):
+    priority = 50  # very high priority to set id
+
+    def started_event(
+        self, ex_info, command, host_info, start_time, config, meta_info, _id
+    ):
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        label = config["experiment_folder"].strip("/").split("/")[-1]
+        custom_id = "{}_{}".format(timestamp, label)
+        return custom_id  # started_event returns the _run._id
+
+
+ex = Experiment("mujoco-eval")
+ex.observers = [SetID(), FileStorageObserver.create("results/mujoco/eval")]
+
+
+def print_rollout(env, policy, latent_space, decode=False):
+    state = env.reset()
+    done = False
+    while not done:
+        a, _ = policy.predict(state, deterministic=False)
+        state, reward, done, info = env.step(a)
+        if decode:
+            obs = latent_space.decoder(state)
+        else:
+            obs = state
+        print("action", a)
+        print("obs", obs)
+        print("reward", reward)
+
+
+@ex.config
+def config():
+    experiment_folder = None  # noqa:F841
+    iteration = -1  # noqa:F841
+
+
+@ex.automain
+def main(_run, experiment_folder, iteration, seed):
+    ex = FileExperimentResults(experiment_folder)
+    env_id = ex.config["env_id"]
+    env = gym.make(env_id)
+
+    if env_id == "InvertedPendulum-v2":
+        iterations = int(6e4)
+    else:
+        iterations = int(2e6)
+
+    label = experiment_folder.strip("/").split("/")[-1]
+
+    if ex.config["debug_handcoded_features"]:
+        latent_space = MujocoDebugFeatures(env)
+    else:
+        graph_latent = tf.Graph()
+        latent_model_path = ex.info["latent_model_checkpoint"]
+        with graph_latent.as_default():
+            latent_space = StateVAE.restore(latent_model_path)
+
+    r_inferred = ex.info["inferred_rewards"][iteration]
+    r_inferred /= np.linalg.norm(r_inferred)
+
+    print("r_inferred")
+    print(r_inferred)
+    if env_id.startswith("Fetch"):
+        env_has_task_reward = True
+        inferred_weight = 0.1
+    else:
+        env_has_task_reward = False
+        inferred_weight = None
+
+    env_inferred = LatentSpaceRewardWrapper(
+        env,
+        latent_space,
+        r_inferred,
+        inferred_weight=inferred_weight,
+        use_task_reward=env_has_task_reward,
+    )
+
+    policy_inferred = get_sac(env_inferred)
+    policy_inferred.learn(total_timesteps=iterations, log_interval=10)
+    with Artifact("policy.zip", None, _run) as f:
+        policy_inferred.save(f)
+
+    print_rollout(env_inferred, policy_inferred, latent_space)
+
+    N = 10
+    true_reward_obtained = evaluate_policy(env, policy_inferred, N)
+    print("Inferred reward policy: true return", true_reward_obtained)
+    if env_has_task_reward:
+        env.use_penalty = False
+        task_reward_obtained = evaluate_policy(env, policy_inferred, N)
+        print("Inferred reward policy: task return", task_reward_obtained)
+        env.use_penalty = True
+    with Artifact("video.mp4", None, _run) as f:
+        inferred_reward_obtained = evaluate_policy(
+            env_inferred, policy_inferred, N, video_out=f
+        )
+    print("Inferred reward policy: inferred return", inferred_reward_obtained)
+
+    good_policy_path = ex.config["good_policy_path"]
+    if good_policy_path is not None:
+        true_reward_policy = SAC.load(good_policy_path)
+        good_policy_true_reward_obtained = evaluate_policy(env, true_reward_policy, N)
+        print("True reward policy: true return", good_policy_true_reward_obtained)
+        if env_has_task_reward:
+            env.use_penalty = False
+            good_policy_task_reward_obtained = evaluate_policy(
+                env, true_reward_policy, N
+            )
+            print("True reward policy: task return", good_policy_task_reward_obtained)
+            env.use_penalty = True
+        good_policy_inferred_reward_obtained = evaluate_policy(
+            env_inferred, true_reward_policy, N
+        )
+        print(
+            "True reward policy: inferred return", good_policy_inferred_reward_obtained
+        )
policy: inferred return", good_policy_inferred_reward_obtained + ) + + random_policy = SAC(MlpPolicySac, env_inferred, verbose=1) + random_policy_true_reward_obtained = evaluate_policy(env, random_policy, N) + print("Random policy: true return", random_policy_true_reward_obtained) + if env_has_task_reward: + env.use_penalty = False + random_policy_task_reward_obtained = evaluate_policy(env, random_policy, N) + print("Random reward policy: task return", random_policy_task_reward_obtained) + env.use_penalty = True + random_policy_inferred_reward_obtained = evaluate_policy( + env_inferred, random_policy, N + ) + print("Random policy: inferred return", random_policy_inferred_reward_obtained) + print()