From 6043e87828926cbd1be56d155f97c45f2f24e3b4 Mon Sep 17 00:00:00 2001
From: Ahmed Nabil <72295771+AI-Ahmed@users.noreply.github.com>
Date: Sun, 23 Oct 2022 22:35:18 +0200
Subject: [PATCH] Added Download Script to download the dataset.

Added Download Script to download the dataset from Yandex.
---
 week05_nlp/part2_pytorch.ipynb | 2195 +++++++++++++++++++-------------
 1 file changed, 1287 insertions(+), 908 deletions(-)

diff --git a/week05_nlp/part2_pytorch.ipynb b/week05_nlp/part2_pytorch.ipynb
index a8f0db57a..743059f52 100644
--- a/week05_nlp/part2_pytorch.ipynb
+++ b/week05_nlp/part2_pytorch.ipynb
@@ -1,911 +1,1290 @@
 {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Natural Language Processing with Deep Learning (7 points)\n",
- "\n",
- "Today we're gonna apply the newly learned DL tools for sequence processing to the task of predicting job salary.\n",
- "\n",
- "Special thanks to [Oleg Vasilev](https://github.com/Omrigan/) for the assignment core (orignally written for theano/tensorflow)."
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "AnKhaDq8xYPR"
+ },
+ "source": [
+ "# Natural Language Processing with Deep Learning (7 points)\n",
+ "\n",
+ "Today we're gonna apply the newly learned DL tools for sequence processing to the task of predicting job salary.\n",
+ "\n",
+ "Special thanks to [Oleg Vasilev](https://github.com/Omrigan/) for the assignment core (originally written for theano/tensorflow)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true,
+ "id": "KCo6ud10xYPV"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JSqK0JBixYPX"
+ },
+ "source": [
+ "### About the challenge\n",
+ "For starters, let's download the data from __[here](https://yadi.sk/d/vVEOWPFY3NruT7)__.\n",
+ "\n",
+ "You can also get it from the competition [page](https://www.kaggle.com/c/job-salary-prediction/data) (in that case, pick `Train_rev1.*`).\n",
+ "\n",
+ "\n",
+ "Our task is to predict one number, __SalaryNormalized__, in the sense of minimizing __Mean Absolute Error__.\n",
+ "\n",
+ "\n",
+ "\n",
+ "To do so, our model can access a number of features:\n",
+ "* Free text: __`Title`__ and __`FullDescription`__\n",
+ "* Categorical: __`Category`__, __`Company`__, __`LocationNormalized`__, __`ContractType`__, and __`ContractTime`__.\n",
+ "\n",
+ "\n",
+ "You can read more [in the official description](https://www.kaggle.com/c/job-salary-prediction#description)."
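A quick numeric sketch of the evaluation setup described above (a sketch only: the salary numbers are made up, and log1p/expm1 are assumed because the notebook later trains on a log1p-transformed target). The model predicts in log1p space, but the score that matters is MAE on the raw salary scale:

```python
import numpy as np

salary = np.array([27500.0, 150000.0, 45000.0])    # made-up salaries
target = np.log1p(salary)                           # what a model would be trained on
pred = target + 0.05                                # pretend predictions, slightly off
mae_on_salary = np.abs(np.expm1(pred) - salary).mean()
print(mae_on_salary)                                # error back on the original salary scale
```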
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### Download the dataset (Thanks to [lowvoltage](https://lowvoltage.github.io/2017/07/29/Yadisk-Direct-Download-Python), edited by [AI-Ahmed](https://github.com/AI-Ahmed))"
+ ],
+ "metadata": {
+ "id": "M94Qg3pKew2l"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import sys\n",
+ "import requests\n",
+ "import zipfile\n",
+ "from glob import glob\n",
+ "from typing import Text\n",
+ "from tqdm.notebook import tqdm\n",
+ "from IPython.display import clear_output\n",
+ "\n",
+ "API_ENDPOINT = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key={}'\n",
+ "\n",
+ "def _get_real_direct_link(sharing_link):\n",
+ "    pk_request = requests.get(API_ENDPOINT.format(sharing_link))\n",
+ "\n",
+ "    # Returns None if the link cannot be \"converted\"\n",
+ "    return pk_request.json().get('href')\n",
+ "\n",
+ "\n",
+ "def _extract_filename(direct_link):\n",
+ "    print(\"Extracting the filename...\")\n",
+ "    for chunk in direct_link.strip().split('&'):\n",
+ "        if chunk.startswith('filename='):\n",
+ "            return chunk.split('=')[1]\n",
+ "    return None\n",
+ "\n",
+ "\n",
+ "def extract_csv_from_zip(filename: Text) -> None:\n",
+ "    if glob('./*.zip'):\n",
+ "        print(\"Extracting the dataset file...\")\n",
+ "        try:\n",
+ "            with zipfile.ZipFile(filename, 'r') as zip_file:\n",
+ "                zip_file.extractall()\n",
+ "        except zipfile.BadZipFile:\n",
+ "            sys.exit(f\"'{filename}' is not a valid zip archive\")\n",
+ "        clear_output()\n",
+ "        print(\"The dataset has been extracted successfully!\")\n",
+ "    else:\n",
+ "        sys.exit(\"File `.zip` Not Found!\")\n",
+ "\n",
+ "\n",
+ "def download_yadisk_link(sharing_link, filename=None):\n",
+ "    direct_link = _get_real_direct_link(sharing_link)\n",
+ "    if direct_link:\n",
+ "        # Try to recover the filename from the link\n",
+ "        filename = filename or _extract_filename(direct_link)\n",
+ "        clear_output()\n",
+ "\n",
+ "        print(\"Downloading the file...\")\n",
+ "        download = requests.get(direct_link)\n",
+ "        with open(filename, 'wb') as out_file:\n",
+ "            out_file.write(download.content)\n",
+ "\n",
+ "        clear_output()\n",
+ "        print('Downloaded \"{}\" to \"{}\"'.format(sharing_link, filename))\n",
+ "        extract_csv_from_zip(filename)\n",
+ "    else:\n",
+ "        print('Failed to download \"{}\"'.format(sharing_link))"
+ ],
+ "metadata": {
+ "id": "8gg7lhfreCmm"
+ },
+ "execution_count": 23,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "download_yadisk_link(\"https://yadi.sk/d/vVEOWPFY3NruT7\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HwOBovvOdZuD",
+ "outputId": "c1213b58-8474-4e55-b2f7-975a0c1631df"
+ },
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "The dataset has been extracted successfully!\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "collapsed": true,
+ "id": "ZTjSL2cCxYPZ",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 381
+ },
+ "outputId": "df32a5f0-83ca-4b9c-a719-878b6ade8936"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Id Title \\\n",
+ "92785 69182809 Residential care home manager Birmingham \n",
+ "97710 69268730 Lead Exploration Geophysicist \n",
+ "67167 68716041 IT Recruitment Manager – IT Recruitment Sector \n",
+ "\n",
+ " FullDescription LocationRaw \\\n",
+ "92785 We are looking for a Residential care home man...
Birmingham \n", + "97710 This Australian oil and natural gas exploratio... Australia \n", + "67167 IT Recruitment Manager – IT Recruitment Sector... London South East \n", + "\n", + " LocationNormalized ContractType ContractTime Company \\\n", + "92785 Birmingham full_time NaN Purely Health Care \n", + "97710 UK NaN permanent NaN \n", + "67167 South East London NaN permanent 5Q \n", + "\n", + " Category \\\n", + "92785 Healthcare & Nursing Jobs \n", + "97710 Energy, Oil & Gas Jobs \n", + "67167 HR & Recruitment Jobs \n", + "\n", + " SalaryRaw SalaryNormalized \\\n", + "92785 20,000 - 35,000/Year 27500 \n", + "97710 150k+ AUD 150000 \n", + "67167 From 40,000 to 50,000 per annum + TEAM OVERRID... 45000 \n", + "\n", + " SourceName Log1pSalary \n", + "92785 staffnurse.com 10.221977 \n", + "97710 hays.co.uk 11.918397 \n", + "67167 totaljobs.com 10.714440 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdTitleFullDescriptionLocationRawLocationNormalizedContractTypeContractTimeCompanyCategorySalaryRawSalaryNormalizedSourceNameLog1pSalary
9278569182809Residential care home manager BirminghamWe are looking for a Residential care home man...BirminghamBirminghamfull_timeNaNPurely Health CareHealthcare & Nursing Jobs20,000 - 35,000/Year27500staffnurse.com10.221977
9771069268730Lead Exploration GeophysicistThis Australian oil and natural gas exploratio...AustraliaUKNaNpermanentNaNEnergy, Oil & Gas Jobs150k+ AUD150000hays.co.uk11.918397
6716768716041IT Recruitment Manager – IT Recruitment SectorIT Recruitment Manager – IT Recruitment Sector...London South EastSouth East LondonNaNpermanent5QHR & Recruitment JobsFrom 40,000 to 50,000 per annum + TEAM OVERRID...45000totaljobs.com10.714440
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], + "source": [ + "data = pd.read_csv(\"./Train_rev1.csv\", index_col=None)\n", + "data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')\n", + "\n", + "text_columns = [\"Title\", \"FullDescription\"]\n", + "categorical_columns = [\"Category\", \"Company\", \"LocationNormalized\", \"ContractType\", \"ContractTime\"]\n", + "target_column = \"Log1pSalary\"\n", + "data[categorical_columns] = data[categorical_columns].fillna('NaN') # cast nan to string\n", + "\n", + "data.sample(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-Fa1BQE0xYPb" + }, + "source": [ + "### The NLP part\n", + "\n", + "To even begin training our neural network, we're gonna need to preprocess the text features: tokenize it and build the token vocabularies.\n", + "\n", + "Since it is not an NLP course, we're gonna use simple built-in NLTK tokenization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "mATF2HFsxYPc" + }, + "outputs": [], + "source": [ + "print(\"Before\")\n", + "print(data[\"Title\"][::100000])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "3O8pdtlJxYPd" + }, + "outputs": [], + "source": [ + "import nltk\n", + "tokenizer = nltk.tokenize.WordPunctTokenizer()\n", + "\n", + "for col in text_columns:\n", + " data[col] = data[col].apply(lambda l: ' '.join(tokenizer.tokenize(str(l).lower())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TNoDy99RxYPe" + }, + "source": [ + "Now we can assume that our text is a space-separated list of tokens:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "PA_dDQQ7xYPf" + }, + "outputs": [], + "source": [ + "print(\"After\")\n", + "print(data[\"Title\"][::100000])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A1QZyGIlxYPh" + }, + "source": [ + "Not all words are equally useful. Some of them are typos or rare words that are only present a few times. \n", + "\n", + "Let's see how many times is each word present in the data so that we can build a \"white list\" of known words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "W3viwPQQxYPi" + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "token_counts = Counter()\n", + "\n", + "# Count how many times does each token occur in \"Title\" and \"FullDescription\"\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "gZLPO_FnxYPj" + }, + "outputs": [], + "source": [ + "print(\"Total unique tokens :\", len(token_counts))\n", + "print('\\n'.join(map(str, token_counts.most_common(n=5))))\n", + "print('...')\n", + "print('\\n'.join(map(str, token_counts.most_common()[-3:])))\n", + "\n", + "assert token_counts.most_common(1)[0][1] in range(2600000, 2700000)\n", + "assert len(token_counts) in range(200000, 210000)\n", + "print('Correct!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "tBw1JAiwxYPk" + }, + "outputs": [], + "source": [ + "# Let's see how many words are there for each count\n", + "\n", + "_=plt.hist(list(token_counts.values()), range=[0, 10**4], bins=50, log=True)\n", + "plt.xlabel(\"Counts\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NMb7Y7rFxYPl" + }, + "source": [ + "__Task 1.1__ Get a list of all tokens that occur at least 10 times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "TVprO2lRxYPl" + }, + "outputs": [], + "source": [ + "min_count = 10\n", + "\n", + "# tokens from token_counts keys that had at least min_count occurrences throughout the dataset\n", + "tokens = \n", + "\n", + "# Add a special tokens for unknown and empty words\n", + "UNK, PAD = \"UNK\", \"PAD\"\n", + "tokens = [UNK, PAD] + tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "xzkFT9z3xYPm" + }, + "outputs": [], + "source": [ + "print(\"Tokens left:\", len(tokens))\n", + "assert type(tokens)==list\n", + "assert len(tokens) in range(32000,35000)\n", + "assert 'me' in tokens\n", + "assert UNK in tokens\n", + "print(\"Correct!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YCLTYvBLxYPn" + }, + "source": [ + "__Task 1.2__ Build an inverse token index: a dictionary from token(string) to it's index in `tokens` (int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "_CLYvEA-xYPo" + }, + "outputs": [], + "source": [ + "token_to_id = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "EVrX-1xfxYPp" + }, + "outputs": [], + "source": [ + "assert isinstance(token_to_id, dict)\n", + "assert len(token_to_id) == len(tokens)\n", + "for tok in tokens:\n", + " assert tokens[token_to_id[tok]] == tok\n", + "\n", + "print(\"Correct!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gJWXr60gxYPp" + }, + "source": [ + "And finally, let's use the vocabulary you've built to map text lines into torch-digestible matrices." 
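For reference, here is one hedged sketch of what Tasks 1.1 and 1.2 above might produce, assuming `token_counts` was filled in as suggested earlier (the actual assignment cells are left blank on purpose):

```python
min_count = 10

# Task 1.1: keep tokens that occur at least min_count times
tokens = sorted(t for t, c in token_counts.items() if c >= min_count)

UNK, PAD = "UNK", "PAD"
tokens = [UNK, PAD] + tokens                # special tokens go first

# Task 1.2: inverse index from a token to its position in `tokens`
token_to_id = {tok: idx for idx, tok in enumerate(tokens)}
assert all(tokens[token_to_id[t]] == t for t in tokens)
```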
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "FDH_iVMexYPq"
+ },
+ "outputs": [],
+ "source": [
+ "UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])\n",
+ "\n",
+ "def as_matrix(sequences, max_len=None):\n",
+ "    \"\"\" Convert a list of tokens into a matrix with padding \"\"\"\n",
+ "    if isinstance(sequences[0], str):\n",
+ "        sequences = list(map(str.split, sequences))\n",
+ "\n",
+ "    max_len = min(max(map(len, sequences)), max_len or float('inf'))\n",
+ "\n",
+ "    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))\n",
+ "    for i, seq in enumerate(sequences):\n",
+ "        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]\n",
+ "        matrix[i, :len(row_ix)] = row_ix\n",
+ "\n",
+ "    return matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "ZSpcpOsFxYPq"
+ },
+ "outputs": [],
+ "source": [
+ "print(\"Lines:\")\n",
+ "print('\\n'.join(data[\"Title\"][::100000].values), end='\\n\\n')\n",
+ "print(\"Matrix:\")\n",
+ "print(as_matrix(data[\"Title\"][::100000]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fSEIEnY2xYPq"
+ },
+ "source": [
+ "Now let's encode the categorical data we have.\n",
+ "\n",
+ "As usual, we shall use one-hot encoding for simplicity. Kudos if you implement tf-idf, target averaging or pseudo-counter-based encoding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "PoeHwkczxYPq"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.feature_extraction import DictVectorizer\n",
+ "\n",
+ "# we only consider top-1k most frequent companies to minimize memory usage\n",
+ "top_companies, top_counts = zip(*Counter(data['Company']).most_common(1000))\n",
+ "recognized_companies = set(top_companies)\n",
+ "data[\"Company\"] = data[\"Company\"].apply(lambda comp: comp if comp in recognized_companies else \"Other\")\n",
+ "\n",
+ "categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)\n",
+ "categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Wjb_HM0axYPr"
+ },
+ "source": [
+ "### The data science part\n",
+ "\n",
+ "Once we've learned to tokenize the data, let's design a machine learning experiment.\n",
+ "\n",
+ "As before, we won't focus too much on validation, opting for a simple train-test split.\n",
+ "\n",
+ "__To be completely rigorous,__ we've committed a small crime here: we used the whole data for tokenization and vocabulary building. A stricter way would be to do that part on the training set only. You may want to do that and measure the magnitude of changes."
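If you want to try the stricter protocol hinted at above (vocabulary built on the training part only), a sketch could look like the following. It reuses `data`, `text_columns`, `UNK`, `PAD` and `min_count` from earlier cells and is only meant to show the idea, not prescribe it:

```python
from collections import Counter
from sklearn.model_selection import train_test_split

# same split as in the next cell; only the vocabulary construction changes
data_train, data_val = train_test_split(data, test_size=0.1, random_state=42)

train_counts = Counter()
for col in text_columns:
    for line in data_train[col]:
        train_counts.update(line.split())

strict_tokens = [UNK, PAD] + sorted(t for t, c in train_counts.items() if c >= min_count)
# compare len(strict_tokens) to len(tokens) and re-run the pipeline to measure the effect
```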
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "1y7dQLchxYPr" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "data_train, data_val = train_test_split(data, test_size=0.1, random_state=42)\n", + "\n", + "print(\"Train size = \", len(data_train))\n", + "print(\"Validation size = \", len(data_val))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "_mEmdhV3xYPs" + }, + "outputs": [], + "source": [ + "def generate_batch(data, batch_size=None, replace=True, max_len=None):\n", + " \"\"\"\n", + " Creates a pytorch-friendly dict from the batch data.\n", + " :returns: a dict with {'title' : int64[batch, title_max_len]\n", + " \"\"\"\n", + " if batch_size is not None:\n", + " data = data.sample(batch_size, replace=replace)\n", + " \n", + " batch = {}\n", + " for col in text_columns:\n", + " batch[col] = as_matrix(data[col].values, max_len)\n", + " \n", + " batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))\n", + " \n", + " if target_column in data.columns:\n", + " batch[target_column] = data[target_column].values\n", + " \n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "mREIidF7xYPs" + }, + "outputs": [], + "source": [ + "generate_batch(data_train, 3, max_len=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C11o0IznxYPt" + }, + "source": [ + "### Finally, let's talk deep learning\n", + "\n", + "Out model consists of three branches:\n", + "* Title encoder\n", + "* Description encoder\n", + "* Categorical features encoder\n", + "\n", + "We will then feed all 3 branches into one common network that predicts salary.\n", + "\n", + "![scheme](https://github.com/yandexdataschool/Practical_DL/raw/master/homework04/conv_salary_architecture.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GP5w1jX2xYPt" + }, + "source": [ + "By default, both text vectorizers shall use 1d convolutions, followed by global pooling over time." 
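To make the "global pooling over time" idea concrete before the encoder classes below, here is a tiny standalone illustration (the shapes are assumptions for the example only): a Conv1d output of shape `[batch, channels, time]` is reduced to a fixed-size `[batch, channels]` vector by taking the max over the time axis, which is what the `GlobalMaxPooling` module in the next cell does.

```python
import torch

h = torch.randn(2, 4, 7)       # [batch=2, channels=4, time=7], e.g. a Conv1d output
pooled = h.max(dim=-1)[0]      # max over the time axis -> torch.Size([2, 4])
print(pooled.shape)            # fixed-size vector regardless of sequence length
```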
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "PEtzjT_axYPt" + }, + "outputs": [], + "source": [ + "import torch, torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.autograd import Variable\n", + "\n", + "class GlobalMaxPooling(nn.Module):\n", + " def __init__(self, dim=-1):\n", + " super(self.__class__, self).__init__()\n", + " self.dim = dim\n", + " \n", + " def forward(self, x):\n", + " return x.max(dim=self.dim)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "joss8iRhxYPu" + }, + "outputs": [], + "source": [ + "class TitleEncoder(nn.Module):\n", + " def __init__(self, n_tokens=len(tokens), out_size=64):\n", + " \"\"\" \n", + " A simple sequential encoder for titles.\n", + " x -> emb -> conv -> global_max -> relu -> dense\n", + " \"\"\"\n", + " super(self.__class__, self).__init__()\n", + " self.emb = nn.Embedding(n_tokens, 64, padding_idx=PAD_IX)\n", + " self.conv1 = nn.Conv1d(64, out_size, kernel_size=3, padding=1)\n", + " self.pool1 = GlobalMaxPooling() \n", + " self.dense = nn.Linear(out_size, out_size)\n", + "\n", + " def forward(self, text_ix):\n", + " \"\"\"\n", + " :param text_ix: int64 Variable of shape [batch_size, max_len]\n", + " :returns: float32 Variable of shape [batch_size, out_size]\n", + " \"\"\"\n", + " h = self.emb(text_ix)\n", + "\n", + " # we transpose from [batch, time, units] to [batch, units, time] to fit Conv1d dim order\n", + " h = torch.transpose(h, 1, 2)\n", + " \n", + " # Apply the layers as defined above. Add some ReLUs before dense.\n", + " \n", + " \n", + " return " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "UdB1WMfuxYPv" + }, + "outputs": [], + "source": [ + "title_encoder = TitleEncoder(out_size=64)\n", + "\n", + "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['Title']))\n", + "dummy_v = title_encoder(dummy_x)\n", + "\n", + "assert isinstance(dummy_v, Variable)\n", + "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", + "\n", + "del title_encoder\n", + "print(\"Seems fine\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dl4zkBYcxYPv" + }, + "source": [ + "__Task 2.1__ Create description encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "JTABwuBqxYPw" + }, + "outputs": [], + "source": [ + "# Define an encoder for job descriptions.\n", + "# Use any means you want so long as it's torch.nn.Module.\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "k-rtbGlbxYPw" + }, + "outputs": [], + "source": [ + "desc_encoder = \n", + "\n", + "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['FullDescription']))\n", + "dummy_v = desc_encoder(dummy_x)\n", + "\n", + "assert isinstance(dummy_v, Variable)\n", + "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", + "del desc_encoder\n", + "print(\"Seems fine too\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iNQWQUw3xYPw" + }, + "source": [ + "__ Task 2.2__ Build one network ~~to rule them all~~" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "AhSwatjhxYPx" + }, + "outputs": [], + "source": [ + "class FullNetwork(nn.Module):\n", + " \"\"\"\n", + " This class does all the steps from (title, desc, categorical) features -> predicted 
target\n", + " It unites title & desc encoders you defined above as long as some layers for head and categorical branch.\n", + " \"\"\"\n", + " \n", + " def __init__(self, n_tokens=len(tokens), n_cat_features=len(categorical_vectorizer.vocabulary_)):\n", + " super(self.__class__, self).__init__()\n", + " \n", + " self.title_encoder = TitleEncoder(out_size=64)\n", + " self.desc_encoder = \n", + " \n", + " # define layers for categorical features. A few dense layers would do.\n", + " \n", + " \n", + " # define \"output\" layers that process depend the three encoded vectors into answer\n", + " \n", + " \n", + " \n", + " def forward(self, title_ix, desc_ix, cat_features):\n", + " \"\"\"\n", + " :param title_ix: int32 Variable [batch, title_len], job titles encoded by as_matrix\n", + " :param desc_ix: int32 Variable [batch, desc_len] , job descriptions encoded by as_matrix\n", + " :param cat_features: float32 Variable [batch, n_cat_features]\n", + " :returns: float32 Variable 1d [batch], predicted log1p-salary\n", + " \"\"\"\n", + " \n", + " # process each data source with it's respective encoder\n", + " title_h = self.title_encoder(title_ix)\n", + " desc_h = \n", + " \n", + " # apply categorical encoder\n", + " cat_h = \n", + " \n", + " # concatenate all vectors together...\n", + " joint_h = torch.cat([title_h, desc_h, cat_h], dim=1)\n", + " \n", + " # ... and stack a few more layers at the top\n", + " \n", + " \n", + " # Note 1: do not forget to select first columns, [:, 0], to get to 1d outputs\n", + " # Note 2: please do not use output nonlinearities.\n", + " \n", + " return " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "1y7jbwTGxYPy" + }, + "outputs": [], + "source": [ + "model = FullNetwork()\n", + "opt = torch.optim.Adam(model.parameters(), lr=1e-3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "P4FEQid5xYPy" + }, + "outputs": [], + "source": [ + "# test it on one batch\n", + "\n", + "batch = generate_batch(data_train, 32)\n", + "\n", + "title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + "desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + "cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + "reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + "prediction = model(title_ix, desc_ix, cat_features)\n", + "\n", + "assert len(prediction.shape) == 1 and prediction.shape[0] == title_ix.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "Zp__hcLFxYPz" + }, + "outputs": [], + "source": [ + "def compute_loss(reference, prediction):\n", + " \"\"\"\n", + " Computes objective for minimization.\n", + " By deafult we minimize MSE, but you are encouraged to try mix up MSE, MAE, huber loss, etc.\n", + " \"\"\"\n", + " return torch.mean((prediction - reference) ** 2)\n", + "\n", + "def compute_mae(reference, prediction):\n", + " \"\"\" Compute MAE on actual salary, assuming your model outputs log1p(salary)\"\"\"\n", + " return torch.abs(torch.exp(reference - 1) - torch.exp(prediction - 1)).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "4aryeHS1xYPz" + }, + "outputs": [], + "source": [ + "loss = compute_loss(reference, prediction)\n", + "dummy_grads = torch.autograd.grad(loss, model.parameters(), retain_graph=True)\n", + "for grad in dummy_grads:\n", + " assert grad is not 
None and not (grad == 0).all(), \"Some model parameters received zero grads. \" \\\n", + " \"Double-check that your model uses all it's layers.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dKlUtZumxYPz" + }, + "source": [ + "### Let's train it!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "MniUXvdGxYP0" + }, + "outputs": [], + "source": [ + "from tqdm import tnrange\n", + "def iterate_minibatches(data, batch_size=32, max_len=None,\n", + " max_batches=None, shuffle=True, verbose=True):\n", + " indices = np.arange(len(data))\n", + " if shuffle:\n", + " indices = np.random.permutation(indices)\n", + " if max_batches is not None:\n", + " indices = indices[: batch_size * max_batches]\n", + " \n", + " irange = tnrange if verbose else range\n", + " \n", + " for start in irange(0, len(indices), batch_size):\n", + " yield generate_batch(data.iloc[indices[start : start + batch_size]], max_len=max_len)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "pVAHg1Z8xYP1" + }, + "outputs": [], + "source": [ + "num_epochs = 100\n", + "max_len = 100\n", + "batch_size = 32\n", + "batches_per_epoch = 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "WHie5jdjxYP1" + }, + "outputs": [], + "source": [ + "for epoch_i in range(num_epochs):\n", + " \n", + " print(\"Training:\")\n", + " train_loss = train_mae = train_batches = 0 \n", + " model.train(True)\n", + " \n", + " for batch in iterate_minibatches(data_train, max_batches=batches_per_epoch):\n", + "\n", + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + " reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + " prediction = model(title_ix, desc_ix, cat_features)\n", + "\n", + " loss = compute_loss(reference, prediction)\n", + " loss.backward()\n", + " opt.step()\n", + " opt.zero_grad()\n", + "\n", + " train_loss += loss.data.numpy()\n", + " train_mae += compute_mae(reference, prediction).data.numpy()\n", + " train_batches += 1\n", + " \n", + " print(\"\\tLoss:\\t%.5f\" % (train_loss / train_batches))\n", + " print(\"\\tMAE:\\t%.5f\" % (train_mae / train_batches))\n", + " print('\\n\\n')\n", + " \n", + " print(\"Validation:\")\n", + " val_loss = val_mae = val_batches = 0\n", + " model.train(False)\n", + " \n", + " with torch.no_grad():\n", + " for batch in iterate_minibatches(data_val, shuffle=False):\n", + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + " reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + " prediction = model(title_ix, desc_ix, cat_features)\n", + " loss = compute_loss(reference, prediction)\n", + "\n", + " val_loss += loss.data.numpy()\n", + " val_mae += compute_mae(reference, prediction).data.numpy()\n", + " val_batches += 1\n", + "\n", + " print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", + " print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", + " print('\\n\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "5kJvB3CRxYP2" + }, + "outputs": [], + "source": [ + "print(\"Final eval:\")\n", + 
"val_loss = val_mae = val_batches = 0\n", + "\n", + "with torch.no_grad():\n", + " for batch in iterate_minibatches(data_val, shuffle=False):\n", + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + " reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + " prediction = model(title_ix, desc_ix, cat_features)\n", + " loss = compute_loss(reference, prediction)\n", + "\n", + " val_loss += loss.data.numpy()\n", + " val_mae += compute_mae(reference, prediction).data.numpy()\n", + " val_batches += 1\n", + "\n", + "print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", + "print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", + "print('\\n\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yAVbs759xYP2" + }, + "source": [ + "### Task 3.2: Actually make it work\n", + "\n", + "Your main task is to use some of the tricks you've learned on the network and analyze if you can improve __validation MAE__.\n", + "\n", + "Try __at least 3 options__ from the list below for a passing grade. If you're into \n", + "\n", + "#### A) CNN architecture\n", + "\n", + "All the tricks you know about dense and convolutional neural networks apply here as well.\n", + "* Dropout. Nuff said.\n", + "* Batch Norm. This time it's `nn.BatchNorm1d`\n", + "* Parallel convolution layers. The idea is that you apply several nn.Conv1d to the same embeddings and concatenate output channels.\n", + "* More layers, more neurons, ya know...\n", + "\n", + "\n", + "#### B) Play with pooling\n", + "\n", + "There's more than one way to do max pooling:\n", + "* Max over time - our `GlobalMaxPooling`\n", + "* Average over time (excluding PAD)\n", + "* Softmax-pooling:\n", + "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot {{e ^ {h_{i, t}}} \\over \\sum_\\tau e ^ {h_{j, \\tau}} } }$$\n", + "\n", + "* Attentive pooling\n", + "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot Attn(h_t)}$$\n", + "\n", + ", where $$ Attn(h_t) = {{e ^ {NN_{attn}(h_t)}} \\over \\sum_\\tau e ^ {NN_{attn}(h_\\tau)}} $$\n", + "and $NN_{attn}$ is a small neural network\n", + "\n", + "\n", + "The optimal score is usually achieved by concatenating several different poolings, including several attentive pooling with different $NN_{attn}$\n", + "\n", + "#### C) Fun with embeddings\n", + "\n", + "It's not always a good idea to train embeddings from scratch. Here's a few tricks:\n", + "\n", + "* Use a pre-trained word2vec from [here](http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/) or [here](http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/).\n", + "* Start with pre-trained embeddings, then fine-tune them with gradient descent\n", + "* Use the same embedding matrix in title and desc vectorizer\n", + "\n", + "#### D) Going recurrent\n", + "\n", + "We've already learned that recurrent networks can do cool stuff in sequence modelling. Turns out, they're not useless for classification as well. With some tricks of course..\n", + "\n", + "* Like convolutional layers, LSTM should be pooled into a fixed-size vector with some of the poolings.\n", + " * Please bear in mind that while convolution uses [batch, units, time] dim order, \n", + " recurrent units are built for [batch, time, unit]. 
You may need to `torch.transpose`.\n", + "\n", + "* Since you know all the text in advance, use bidirectional RNN\n", + " * Run one LSTM from left to right\n", + " * Run another in parallel from right to left \n", + " * Concatenate their output sequences along unit axis (dim=-1)\n", + "\n", + "* It might be good idea to mix convolutions and recurrent layers differently for title and description\n", + "\n", + "\n", + "#### E) Optimizing seriously\n", + "\n", + "* You don't necessarily need 100 epochs. Use early stopping. If you've never done this before, take a look at [keras](https://github.com/keras-team/keras/blob/master/keras/callbacks.py#L461) for inspiration.\n", + " * In short, train until you notice that validation\n", + " * Maintain the best-on-validation snapshot via `model.state_dict`\n", + " * Plotting learning curves is usually a good idea" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yRyIn5htxYP3" + }, + "source": [ + "### A short report\n", + "\n", + "Please tell us what you did and how did it work.\n", + "\n", + "``, i guess..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "tDSMuYg3xYP4" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "colab": { + "provenance": [] + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### About the challenge\n", - "For starters, let's download the data from __[here](https://yadi.sk/d/vVEOWPFY3NruT7)__.\n", - "\n", - "You can also get it from the competition [page](https://www.kaggle.com/c/job-salary-prediction/data) (in that case, pick `Train_rev1.*`).\n", - "\n", - "\n", - "Our task is to predict one number, __SalaryNormalized__, in the sense of minimizing __Mean Absolute Error__.\n", - "\n", - "\n", - "\n", - "To do so, our model ca access a number of features:\n", - "* Free text: __`Title`__ and __`FullDescription`__\n", - "* Categorical: __`Category`__, __`Company`__, __`LocationNormalized`__, __`ContractType`__, and __`ContractTime`__.\n", - "\n", - "\n", - "You can read more [in the official description](https://www.kaggle.com/c/job-salary-prediction#description)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "data = pd.read_csv(\"./Train_rev1.csv\", index_col=None)\n", - "data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')\n", - "\n", - "text_columns = [\"Title\", \"FullDescription\"]\n", - "categorical_columns = [\"Category\", \"Company\", \"LocationNormalized\", \"ContractType\", \"ContractTime\"]\n", - "target_column = \"Log1pSalary\"\n", - "data[categorical_columns] = data[categorical_columns].fillna('NaN') # cast nan to string\n", - "\n", - "data.sample(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The NLP part\n", - "\n", - "To even begin training our neural network, we're gonna need to preprocess the text features: tokenize it and build the token vocabularies.\n", - "\n", - "Since it is not an NLP course, we're gonna use simple built-in NLTK tokenization." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Before\")\n", - "print(data[\"Title\"][::100000])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import nltk\n", - "tokenizer = nltk.tokenize.WordPunctTokenizer()\n", - "\n", - "for col in text_columns:\n", - " data[col] = data[col].apply(lambda l: ' '.join(tokenizer.tokenize(str(l).lower())))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can assume that our text is a space-separated list of tokens:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"After\")\n", - "print(data[\"Title\"][::100000])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Not all words are equally useful. Some of them are typos or rare words that are only present a few times. \n", - "\n", - "Let's see how many times is each word present in the data so that we can build a \"white list\" of known words." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from collections import Counter\n", - "token_counts = Counter()\n", - "\n", - "# Count how many times does each token occur in \"Title\" and \"FullDescription\"\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Total unique tokens :\", len(token_counts))\n", - "print('\\n'.join(map(str, token_counts.most_common(n=5))))\n", - "print('...')\n", - "print('\\n'.join(map(str, token_counts.most_common()[-3:])))\n", - "\n", - "assert token_counts.most_common(1)[0][1] in range(2600000, 2700000)\n", - "assert len(token_counts) in range(200000, 210000)\n", - "print('Correct!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Let's see how many words are there for each count\n", - "\n", - "_=plt.hist(list(token_counts.values()), range=[0, 10**4], bins=50, log=True)\n", - "plt.xlabel(\"Counts\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Task 1.1__ Get a list of all tokens that occur at least 10 times." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "min_count = 10\n", - "\n", - "# tokens from token_counts keys that had at least min_count occurrences throughout the dataset\n", - "tokens = \n", - "\n", - "# Add a special tokens for unknown and empty words\n", - "UNK, PAD = \"UNK\", \"PAD\"\n", - "tokens = [UNK, PAD] + tokens" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Tokens left:\", len(tokens))\n", - "assert type(tokens)==list\n", - "assert len(tokens) in range(32000,35000)\n", - "assert 'me' in tokens\n", - "assert UNK in tokens\n", - "print(\"Correct!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Task 1.2__ Build an inverse token index: a dictionary from token(string) to it's index in `tokens` (int)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "token_to_id = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "assert isinstance(token_to_id, dict)\n", - "assert len(token_to_id) == len(tokens)\n", - "for tok in tokens:\n", - " assert tokens[token_to_id[tok]] == tok\n", - "\n", - "print(\"Correct!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And finally, let's use the vocabulary you've built to map text lines into torch-digestible matrices." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])\n", - "\n", - "def as_matrix(sequences, max_len=None):\n", - " \"\"\" Convert a list of tokens into a matrix with padding \"\"\"\n", - " if isinstance(sequences[0], str):\n", - " sequences = list(map(str.split, sequences))\n", - " \n", - " max_len = min(max(map(len, sequences)), max_len or float('inf'))\n", - " \n", - " matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))\n", - " for i,seq in enumerate(sequences):\n", - " row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]\n", - " matrix[i, :len(row_ix)] = row_ix\n", - " \n", - " return matrix" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#### print(\"Lines:\")\n", - "print('\\n'.join(data[\"Title\"][::100000].values), end='\\n\\n')\n", - "print(\"Matrix:\")\n", - "print(as_matrix(data[\"Title\"][::100000]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's encode the categirical data we have.\n", - "\n", - "As usual, we shall use one-hot encoding for simplicity. Kudos if you implement tf-idf, target averaging or pseudo-counter-based encoding." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sklearn.feature_extraction import DictVectorizer\n", - "\n", - "# we only consider top-1k most frequent companies to minimize memory usage\n", - "top_companies, top_counts = zip(*Counter(data['Company']).most_common(1000))\n", - "recognized_companies = set(top_companies)\n", - "data[\"Company\"] = data[\"Company\"].apply(lambda comp: comp if comp in recognized_companies else \"Other\")\n", - "\n", - "categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)\n", - "categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The data science part\n", - "\n", - "Once we've learned to tokenize the data, let's design a machine learning experiment.\n", - "\n", - "As before, we won't focus too much on validation, opting for a simple train-test split.\n", - "\n", - "__To be completely rigorous,__ we've comitted a small crime here: we used the whole data for tokenization and vocabulary building. A more strict way would be to do that part on training set only. You may want to do that and measure the magnitude of changes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "data_train, data_val = train_test_split(data, test_size=0.1, random_state=42)\n", - "\n", - "print(\"Train size = \", len(data_train))\n", - "print(\"Validation size = \", len(data_val))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def generate_batch(data, batch_size=None, replace=True, max_len=None):\n", - " \"\"\"\n", - " Creates a pytorch-friendly dict from the batch data.\n", - " :returns: a dict with {'title' : int64[batch, title_max_len]\n", - " \"\"\"\n", - " if batch_size is not None:\n", - " data = data.sample(batch_size, replace=replace)\n", - " \n", - " batch = {}\n", - " for col in text_columns:\n", - " batch[col] = as_matrix(data[col].values, max_len)\n", - " \n", - " batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))\n", - " \n", - " if target_column in data.columns:\n", - " batch[target_column] = data[target_column].values\n", - " \n", - " return batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "generate_batch(data_train, 3, max_len=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Finally, let's talk deep learning\n", - "\n", - "Out model consists of three branches:\n", - "* Title encoder\n", - "* Description encoder\n", - "* Categorical features encoder\n", - "\n", - "We will then feed all 3 branches into one common network that predicts salary.\n", - "\n", - "![scheme](https://github.com/yandexdataschool/Practical_DL/raw/master/homework04/conv_salary_architecture.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, both text vectorizers shall use 1d convolutions, followed by global pooling over time." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import torch, torch.nn as nn\n", - "import torch.nn.functional as F\n", - "from torch.autograd import Variable\n", - "\n", - "class GlobalMaxPooling(nn.Module):\n", - " def __init__(self, dim=-1):\n", - " super(self.__class__, self).__init__()\n", - " self.dim = dim\n", - " \n", - " def forward(self, x):\n", - " return x.max(dim=self.dim)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "class TitleEncoder(nn.Module):\n", - " def __init__(self, n_tokens=len(tokens), out_size=64):\n", - " \"\"\" \n", - " A simple sequential encoder for titles.\n", - " x -> emb -> conv -> global_max -> relu -> dense\n", - " \"\"\"\n", - " super(self.__class__, self).__init__()\n", - " self.emb = nn.Embedding(n_tokens, 64, padding_idx=PAD_IX)\n", - " self.conv1 = nn.Conv1d(64, out_size, kernel_size=3, padding=1)\n", - " self.pool1 = GlobalMaxPooling() \n", - " self.dense = nn.Linear(out_size, out_size)\n", - "\n", - " def forward(self, text_ix):\n", - " \"\"\"\n", - " :param text_ix: int64 Variable of shape [batch_size, max_len]\n", - " :returns: float32 Variable of shape [batch_size, out_size]\n", - " \"\"\"\n", - " h = self.emb(text_ix)\n", - "\n", - " # we transpose from [batch, time, units] to [batch, units, time] to fit Conv1d dim order\n", - " h = torch.transpose(h, 1, 2)\n", - " \n", - " # Apply the layers as defined above. Add some ReLUs before dense.\n", - " \n", - " \n", - " return " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "title_encoder = TitleEncoder(out_size=64)\n", - "\n", - "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['Title']))\n", - "dummy_v = title_encoder(dummy_x)\n", - "\n", - "assert isinstance(dummy_v, Variable)\n", - "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", - "\n", - "del title_encoder\n", - "print(\"Seems fine\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Task 2.1__ Create description encoder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Define an encoder for job descriptions.\n", - "# Use any means you want so long as it's torch.nn.Module.\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "desc_encoder = \n", - "\n", - "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['FullDescription']))\n", - "dummy_v = desc_encoder(dummy_x)\n", - "\n", - "assert isinstance(dummy_v, Variable)\n", - "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", - "del desc_encoder\n", - "print(\"Seems fine too\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__ Task 2.2__ Build one network ~~to rule them all~~" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "class FullNetwork(nn.Module):\n", - " \"\"\"\n", - " This class does all the steps from (title, desc, categorical) features -> predicted target\n", - " It unites title & desc encoders you defined above as long as some layers for head and categorical branch.\n", - " \"\"\"\n", - " \n", - " def __init__(self, n_tokens=len(tokens), 
n_cat_features=len(categorical_vectorizer.vocabulary_)):\n", - " super(self.__class__, self).__init__()\n", - " \n", - " self.title_encoder = TitleEncoder(out_size=64)\n", - " self.desc_encoder = \n", - " \n", - " # define layers for categorical features. A few dense layers would do.\n", - " \n", - " \n", - " # define \"output\" layers that process depend the three encoded vectors into answer\n", - " \n", - " \n", - " \n", - " def forward(self, title_ix, desc_ix, cat_features):\n", - " \"\"\"\n", - " :param title_ix: int32 Variable [batch, title_len], job titles encoded by as_matrix\n", - " :param desc_ix: int32 Variable [batch, desc_len] , job descriptions encoded by as_matrix\n", - " :param cat_features: float32 Variable [batch, n_cat_features]\n", - " :returns: float32 Variable 1d [batch], predicted log1p-salary\n", - " \"\"\"\n", - " \n", - " # process each data source with it's respective encoder\n", - " title_h = self.title_encoder(title_ix)\n", - " desc_h = \n", - " \n", - " # apply categorical encoder\n", - " cat_h = \n", - " \n", - " # concatenate all vectors together...\n", - " joint_h = torch.cat([title_h, desc_h, cat_h], dim=1)\n", - " \n", - " # ... and stack a few more layers at the top\n", - " \n", - " \n", - " # Note 1: do not forget to select first columns, [:, 0], to get to 1d outputs\n", - " # Note 2: please do not use output nonlinearities.\n", - " \n", - " return " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "model = FullNetwork()\n", - "opt = torch.optim.Adam(model.parameters(), lr=1e-3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# test it on one batch\n", - "\n", - "batch = generate_batch(data_train, 32)\n", - "\n", - "title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - "desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - "cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - "reference = Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - "prediction = model(title_ix, desc_ix, cat_features)\n", - "\n", - "assert len(prediction.shape) == 1 and prediction.shape[0] == title_ix.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def compute_loss(reference, prediction):\n", - " \"\"\"\n", - " Computes objective for minimization.\n", - " By deafult we minimize MSE, but you are encouraged to try mix up MSE, MAE, huber loss, etc.\n", - " \"\"\"\n", - " return torch.mean((prediction - reference) ** 2)\n", - "\n", - "def compute_mae(reference, prediction):\n", - " \"\"\" Compute MAE on actual salary, assuming your model outputs log1p(salary)\"\"\"\n", - " return torch.abs(torch.exp(reference - 1) - torch.exp(prediction - 1)).mean()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "loss = compute_loss(reference, prediction)\n", - "dummy_grads = torch.autograd.grad(loss, model.parameters(), retain_graph=True)\n", - "for grad in dummy_grads:\n", - " assert grad is not None and not (grad == 0).all(), \"Some model parameters received zero grads. \" \\\n", - " \"Double-check that your model uses all it's layers.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Let's train it!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from tqdm import tnrange\n", - "def iterate_minibatches(data, batch_size=32, max_len=None,\n", - " max_batches=None, shuffle=True, verbose=True):\n", - " indices = np.arange(len(data))\n", - " if shuffle:\n", - " indices = np.random.permutation(indices)\n", - " if max_batches is not None:\n", - " indices = indices[: batch_size * max_batches]\n", - " \n", - " irange = tnrange if verbose else range\n", - " \n", - " for start in irange(0, len(indices), batch_size):\n", - " yield generate_batch(data.iloc[indices[start : start + batch_size]], max_len=max_len)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "num_epochs = 100\n", - "max_len = 100\n", - "batch_size = 32\n", - "batches_per_epoch = 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for epoch_i in range(num_epochs):\n", - " \n", - " print(\"Training:\")\n", - " train_loss = train_mae = train_batches = 0 \n", - " model.train(True)\n", - " \n", - " for batch in iterate_minibatches(data_train, max_batches=batches_per_epoch):\n", - "\n", - " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - " reference = Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - " prediction = model(title_ix, desc_ix, cat_features)\n", - "\n", - " loss = compute_loss(reference, prediction)\n", - " loss.backward()\n", - " opt.step()\n", - " opt.zero_grad()\n", - "\n", - " train_loss += loss.data.numpy()\n", - " train_mae += compute_mae(reference, prediction).data.numpy()\n", - " train_batches += 1\n", - " \n", - " print(\"\\tLoss:\\t%.5f\" % (train_loss / train_batches))\n", - " print(\"\\tMAE:\\t%.5f\" % (train_mae / train_batches))\n", - " print('\\n\\n')\n", - " \n", - " print(\"Validation:\")\n", - " val_loss = val_mae = val_batches = 0\n", - " model.train(False)\n", - " \n", - " with torch.no_grad():\n", - " for batch in iterate_minibatches(data_val, shuffle=False):\n", - " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - " reference = Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - " prediction = model(title_ix, desc_ix, cat_features)\n", - " loss = compute_loss(reference, prediction)\n", - "\n", - " val_loss += loss.data.numpy()\n", - " val_mae += compute_mae(reference, prediction).data.numpy()\n", - " val_batches += 1\n", - "\n", - " print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", - " print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", - " print('\\n\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Final eval:\")\n", - "val_loss = val_mae = val_batches = 0\n", - "\n", - "with torch.no_grad():\n", - " for batch in iterate_minibatches(data_val, shuffle=False):\n", - " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - " reference 
= Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - " prediction = model(title_ix, desc_ix, cat_features)\n", - " loss = compute_loss(reference, prediction)\n", - "\n", - " val_loss += loss.data.numpy()\n", - " val_mae += compute_mae(reference, prediction).data.numpy()\n", - " val_batches += 1\n", - "\n", - "print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", - "print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", - "print('\\n\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 3.2: Actually make it work\n", - "\n", - "Your main task is to use some of the tricks you've learned on the network and analyze if you can improve __validation MAE__.\n", - "\n", - "Try __at least 3 options__ from the list below for a passing grade. If you're into \n", - "\n", - "#### A) CNN architecture\n", - "\n", - "All the tricks you know about dense and convolutional neural networks apply here as well.\n", - "* Dropout. Nuff said.\n", - "* Batch Norm. This time it's `nn.BatchNorm1d`\n", - "* Parallel convolution layers. The idea is that you apply several nn.Conv1d to the same embeddings and concatenate output channels.\n", - "* More layers, more neurons, ya know...\n", - "\n", - "\n", - "#### B) Play with pooling\n", - "\n", - "There's more than one way to do max pooling:\n", - "* Max over time - our `GlobalMaxPooling`\n", - "* Average over time (excluding PAD)\n", - "* Softmax-pooling:\n", - "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot {{e ^ {h_{i, t}}} \\over \\sum_\\tau e ^ {h_{j, \\tau}} } }$$\n", - "\n", - "* Attentive pooling\n", - "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot Attn(h_t)}$$\n", - "\n", - ", where $$ Attn(h_t) = {{e ^ {NN_{attn}(h_t)}} \\over \\sum_\\tau e ^ {NN_{attn}(h_\\tau)}} $$\n", - "and $NN_{attn}$ is a small neural network\n", - "\n", - "\n", - "The optimal score is usually achieved by concatenating several different poolings, including several attentive pooling with different $NN_{attn}$\n", - "\n", - "#### C) Fun with embeddings\n", - "\n", - "It's not always a good idea to train embeddings from scratch. Here's a few tricks:\n", - "\n", - "* Use a pre-trained word2vec from [here](http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/) or [here](http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/).\n", - "* Start with pre-trained embeddings, then fine-tune them with gradient descent\n", - "* Use the same embedding matrix in title and desc vectorizer\n", - "\n", - "#### D) Going recurrent\n", - "\n", - "We've already learned that recurrent networks can do cool stuff in sequence modelling. Turns out, they're not useless for classification as well. With some tricks of course..\n", - "\n", - "* Like convolutional layers, LSTM should be pooled into a fixed-size vector with some of the poolings.\n", - " * Please bear in mind that while convolution uses [batch, units, time] dim order, \n", - " recurrent units are built for [batch, time, unit]. You may need to `torch.transpose`.\n", - "\n", - "* Since you know all the text in advance, use bidirectional RNN\n", - " * Run one LSTM from left to right\n", - " * Run another in parallel from right to left \n", - " * Concatenate their output sequences along unit axis (dim=-1)\n", - "\n", - "* It might be good idea to mix convolutions and recurrent layers differently for title and description\n", - "\n", - "\n", - "#### E) Optimizing seriously\n", - "\n", - "* You don't necessarily need 100 epochs. Use early stopping. 
If you've never done this before, take a look at [keras](https://github.com/keras-team/keras/blob/master/keras/callbacks.py#L461) for inspiration.\n", - " * In short, train until you notice that validation\n", - " * Maintain the best-on-validation snapshot via `model.state_dict`\n", - " * Plotting learning curves is usually a good idea" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### A short report\n", - "\n", - "Please tell us what you did and how did it work.\n", - "\n", - "``, i guess..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 0 }