Commit 54c9f0d

Author: Xingdong Zuo (committed)

Update

Former-commit-id: d615ccd64a2089a302e61651b9761498d3199b44 [formerly f131ab52da6f638cd55a494f9b3ec7d54f6baaa5]
Former-commit-id: ec4abd8754806888f3a47ac67985a8011a751a2b

Parent: 78ab69f

2 files changed: 290 additions & 10 deletions

examples/policy_gradient/vpg/main.ipynb

Lines changed: 290 additions & 1 deletion
@@ -1,5 +1,294 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "libcudart.so.9.2: cannot open shared object file: No such file or directory",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-1-587e5575a1c6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptim\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0moptim\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/RL/lib/python3.7/site-packages/torch/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m __all__ += [name for name in dir(_C)\n",
+      "\u001b[0;31mImportError\u001b[0m: libcudart.so.9.2: cannot open shared object file: No such file or directory"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import torch\n",
+    "import torch.optim as optim\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "from torch.nn.utils import clip_grad_norm_\n",
+    "\n",
+    "from lagom.networks import BaseNetwork\n",
+    "from lagom.networks import make_fc\n",
+    "from lagom.networks import ortho_init\n",
+    "from lagom.networks import linear_lr_scheduler\n",
+    "\n",
+    "from lagom.policies import BasePolicy\n",
+    "from lagom.policies import CategoricalHead\n",
+    "from lagom.policies import DiagGaussianHead\n",
+    "from lagom.policies import constraint_action\n",
+    "\n",
+    "from lagom.value\n",
+    "\n",
+    "from lagom.transform import Standardize\n",
+    "\n",
+    "from lagom.agents import BaseAgent\n",
+    "\n",
+    "\n",
+    "class MLP(BaseNetwork):\n",
+    "    def make_params(self, config):\n",
+    "        self.feature_layers = make_fc(self.env_spec.observation_space.flat_dim, config['network.hidden_sizes'])\n",
+    " \n",
+    "    def init_params(self, config):\n",
+    "        for layer in self.feature_layers:\n",
+    "            ortho_init(layer, nonlinearity='tanh', constant_bias=0.0)\n",
+    " \n",
+    "    def reset(self, config, **kwargs):\n",
+    "        pass\n",
+    " \n",
+    "    def forward(self, x):\n",
+    "        for layer in self.feature_layers:\n",
+    "            x = torch.tanh(layer(x))\n",
+    " \n",
+    "        return x\n",
+    " \n",
+    " \n",
+    "class Policy(BasePolicy):\n",
+    "    def make_networks(self, config):\n",
+    "        self.feature_network = MLP(config, self.device, env_spec=self.env_spec)\n",
+    "        feature_dim = config['network.hidden_sizes'][-1]\n",
+    " \n",
+    "        if self.env_spec.control_type == 'Discrete':\n",
+    "            self.action_head = CategoricalHead(config, self.device, feature_dim, self.env_spec)\n",
+    "        elif self.env_spec.control_type == 'Continuous':\n",
+    "            self.action_head = DiagGaussianHead(config, \n",
+    "                                                self.device, \n",
+    "                                                feature_dim, \n",
+    "                                                self.env_spec, \n",
+    "                                                min_std=config['agent.min_std'], \n",
+    "                                                std_style=config['agent.std_style'], \n",
+    "                                                constant_std=config['agent.constant_std'],\n",
+    "                                                std_state_dependent=config['agent.std_state_dependent'],\n",
+    "                                                init_std=config['agent.init_std'])\n",
+    " \n",
+    "    @property\n",
+    "    def recurrent(self):\n",
+    "        return False\n",
+    " \n",
+    "    def reset(self, config, **kwargs):\n",
+    "        pass\n",
+    "\n",
+    "    def __call__(self, x, out_keys=['action'], info={}, **kwargs):\n",
+    "        out = {}\n",
+    " \n",
+    "        features = self.feature_network(x)\n",
+    "        action_dist = self.action_head(features)\n",
+    " \n",
+    "        action = action_dist.sample().detach()################################\n",
+    "        out['action'] = action\n",
+    " \n",
+    "        if 'action_logprob' in out_keys:\n",
+    "            out['action_logprob'] = action_dist.log_prob(action)\n",
+    "        if 'entropy' in out_keys:\n",
+    "            out['entropy'] = action_dist.entropy()\n",
+    "        if 'perplexity' in out_keys:\n",
+    "            out['perplexity'] = action_dist.perplexity()\n",
+    " \n",
+    "        return out\n",
+    " \n",
+    "\n",
+    "class Agent(BaseAgent):\n",
+    "    r\"\"\"REINFORCE (no baseline). \"\"\"\n",
+    "    def make_modules(self, config):\n",
+    "        self.policy = Policy(config, self.env_spec, self.device)\n",
+    " \n",
+    "    def prepare(self, config, **kwargs):\n",
+    "        self.total_T = 0\n",
+    "        self.optimizer = optim.Adam(self.policy.parameters(), lr=config['algo.lr'])\n",
+    "        if config['algo.use_lr_scheduler']:\n",
+    "            if 'train.iter' in config:\n",
+    "                self.lr_scheduler = linear_lr_scheduler(self.optimizer, config['train.iter'], 'iteration-based')\n",
+    "            elif 'train.timestep' in config:\n",
+    "                self.lr_scheduler = linear_lr_scheduler(self.optimizer, config['train.timestep']+1, 'timestep-based')\n",
+    "        else:\n",
+    "            self.lr_scheduler = None\n",
+    " \n",
+    "\n",
+    "    def reset(self, config, **kwargs):\n",
+    "        pass\n",
+    "\n",
+    "    def choose_action(self, obs, info={}):\n",
+    "        obs = torch.from_numpy(np.asarray(obs)).float().to(self.device)\n",
+    " \n",
+    "        out = self.policy(obs, out_keys=['action', 'action_logprob', 'entropy'], info=info)\n",
+    " \n",
+    "        # sanity check for NaN\n",
+    "        if torch.any(torch.isnan(out['action'])):\n",
+    "            while True:\n",
+    "                print('NaN !')\n",
+    "        if self.env_spec.control_type == 'Continuous':\n",
+    "            out['action'] = constraint_action(self.env_spec, out['action'])\n",
+    " \n",
+    "        return out\n",
+    "\n",
+    "    def learn(self, D, info={}):\n",
+    "        batch_policy_loss = []\n",
+    "        batch_entropy_loss = []\n",
+    "        batch_total_loss = []\n",
+    " \n",
+    "        for trajectory in D:\n",
+    "            logprobs = trajectory.all_info('action_logprob')\n",
+    "            entropies = trajectory.all_info('entropy')\n",
+    "            Qs = trajectory.all_discounted_returns(self.config['algo.gamma'])\n",
+    " \n",
+    "            # Standardize: encourage/discourage half of performed actions\n",
+    "            if self.config['agent.standardize_Q']:\n",
+    "                Qs = Standardize()(Qs, -1).tolist()\n",
+    " \n",
+    "            policy_loss = []\n",
+    "            entropy_loss = []\n",
+    "            for logprob, entropy, Q in zip(logprobs, entropies, Qs):\n",
+    "                policy_loss.append(-logprob*Q)\n",
+    "                entropy_loss.append(-entropy)\n",
+    " \n",
+    "            policy_loss = torch.stack(policy_loss).mean()\n",
+    "            entropy_loss = torch.stack(entropy_loss).mean()\n",
+    " \n",
+    "            entropy_coef = self.config['agent.entropy_coef']\n",
+    "            total_loss = policy_loss + entropy_coef*entropy_loss\n",
+    " \n",
+    "            batch_policy_loss.append(policy_loss)\n",
+    "            batch_entropy_loss.append(entropy_loss)\n",
+    "            batch_total_loss.append(total_loss)\n",
+    " \n",
+    "        policy_loss = torch.stack(batch_policy_loss).mean()\n",
+    "        entropy_loss = torch.stack(batch_entropy_loss).mean()\n",
+    "        loss = torch.stack(batch_total_loss).mean()\n",
+    " \n",
+    "        self.optimizer.zero_grad()\n",
+    "        loss.backward()\n",
+    " \n",
+    "        if self.config['agent.max_grad_norm'] is not None:\n",
+    "            clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])\n",
+    " \n",
+    "        if self.lr_scheduler is not None:\n",
+    "            if self.lr_scheduler.mode == 'iteration-based':\n",
+    "                self.lr_scheduler.step()\n",
+    "            elif self.lr_scheduler.mode == 'timestep-based':\n",
+    "                self.lr_scheduler.step(self.total_T)\n",
+    "\n",
+    "        self.optimizer.step()\n",
+    " \n",
+    "        self.total_T += sum([trajectory.T for trajectory in D])\n",
+    " \n",
+    "        out = {}\n",
+    "        out['loss'] = loss.item()\n",
+    "        out['policy_loss'] = policy_loss.item()\n",
+    "        out['entropy_loss'] = entropy_loss.item()\n",
+    "        if self.lr_scheduler is not None:\n",
+    "            out['current_lr'] = self.lr_scheduler.get_lr()\n",
+    "\n",
+    "        return out\n",
+    " \n",
+    "    @property\n",
+    "    def recurrent(self):\n",
+    "        pass\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 24,
@@ -226,7 +515,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.6"
+   "version": "3.7.0"
   }
  },
  "nbformat": 4,

examples/policy_gradient/vpg/model.py

Lines changed: 0 additions & 9 deletions
@@ -1,13 +1,4 @@
-import numpy as np
 
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.utils import clip_grad_norm_
-
-from .base_agent import BaseAgent
-
-from lagom.core.transform import Standardize
 
 
 class VPGAgent(BaseAgent):
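
Besides the loss itself, the notebook's prepare() wires up linear_lr_scheduler so the learning rate decays linearly over training, stepped either per iteration or per environment timestep. A rough stand-in for the 'iteration-based' branch using only torch.optim is sketched below; N, the model, and the dummy objective are illustrative assumptions, not part of this commit.

# Linear decay of the learning rate from its initial value to 0 over N updates,
# a rough equivalent of lagom's linear_lr_scheduler in 'iteration-based' mode.
import torch
from torch import nn, optim

N = 1000  # assumed total number of training iterations
model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda i: max(0.0, 1.0 - i / N))

for i in range(N):
    loss = model(torch.randn(8, 4)).pow(2).mean()  # dummy objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()  # shrink the lr after each update, as in the 'iteration-based' branch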
