From 6043e87828926cbd1be56d155f97c45f2f24e3b4 Mon Sep 17 00:00:00 2001
From: Ahmed Nabil <72295771+AI-Ahmed@users.noreply.github.com>
Date: Sun, 23 Oct 2022 22:35:18 +0200
Subject: [PATCH] Added Download Script to download the dataset.

Added Download Script to download the dataset from Yandex.
---
 week05_nlp/part2_pytorch.ipynb | 2195 +++++++++++++++++++-------------
 1 file changed, 1287 insertions(+), 908 deletions(-)

diff --git a/week05_nlp/part2_pytorch.ipynb b/week05_nlp/part2_pytorch.ipynb
index a8f0db57a..743059f52 100644
--- a/week05_nlp/part2_pytorch.ipynb
+++ b/week05_nlp/part2_pytorch.ipynb
@@ -1,911 +1,1290 @@
 {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Natural Language Processing with Deep Learning (7 points)\n",
- "\n",
- "Today we're gonna apply the newly learned DL tools for sequence processing to the task of predicting job salary.\n",
- "\n",
- "Special thanks to [Oleg Vasilev](https://github.com/Omrigan/) for the assignment core (orignally written for theano/tensorflow)."
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "AnKhaDq8xYPR"
+ },
+ "source": [
+ "# Natural Language Processing with Deep Learning (7 points)\n",
+ "\n",
+ "Today we're gonna apply the newly learned DL tools for sequence processing to the task of predicting job salary.\n",
+ "\n",
+ "Special thanks to [Oleg Vasilev](https://github.com/Omrigan/) for the assignment core (originally written for theano/tensorflow)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true,
+ "id": "KCo6ud10xYPV"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JSqK0JBixYPX"
+ },
+ "source": [
+ "### About the challenge\n",
+ "For starters, let's download the data from __[here](https://yadi.sk/d/vVEOWPFY3NruT7)__.\n",
+ "\n",
+ "You can also get it from the competition [page](https://www.kaggle.com/c/job-salary-prediction/data) (in that case, pick `Train_rev1.*`).\n",
+ "\n",
+ "\n",
+ "Our task is to predict one number, __SalaryNormalized__, in the sense of minimizing __Mean Absolute Error__.\n",
+ "\n",
+ "\n",
+ "\n",
+ "To do so, our model can access a number of features:\n",
+ "* Free text: __`Title`__ and __`FullDescription`__\n",
+ "* Categorical: __`Category`__, __`Company`__, __`LocationNormalized`__, __`ContractType`__, and __`ContractTime`__.\n",
+ "\n",
+ "\n",
+ "You can read more [in the official description](https://www.kaggle.com/c/job-salary-prediction#description)."
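A quick numeric sketch of the evaluation setup described above (a sketch only: the salary numbers are made up, and log1p/expm1 are assumed because the notebook later trains on a log1p-transformed target). The model predicts in log1p space, but the score that matters is MAE on the raw salary scale:

```python
import numpy as np

salary = np.array([27500.0, 150000.0, 45000.0])    # made-up salaries
target = np.log1p(salary)                           # what a model would be trained on
pred = target + 0.05                                # pretend predictions, slightly off
mae_on_salary = np.abs(np.expm1(pred) - salary).mean()
print(mae_on_salary)                                # error back on the original salary scale
```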
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### Download the dataset (Thanks to [lowvoltage](https://lowvoltage.github.io/2017/07/29/Yadisk-Direct-Download-Python), edited by [AI-Ahmed](https://github.com/AI-Ahmed))"
+ ],
+ "metadata": {
+ "id": "M94Qg3pKew2l"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import sys\n",
+ "import requests\n",
+ "import zipfile\n",
+ "from glob import glob\n",
+ "from typing import Text\n",
+ "from tqdm.notebook import tqdm\n",
+ "from IPython.display import clear_output\n",
+ "\n",
+ "API_ENDPOINT = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key={}'\n",
+ "\n",
+ "def _get_real_direct_link(sharing_link):\n",
+ "    pk_request = requests.get(API_ENDPOINT.format(sharing_link))\n",
+ "\n",
+ "    # Returns None if the link cannot be \"converted\"\n",
+ "    return pk_request.json().get('href')\n",
+ "\n",
+ "\n",
+ "def _extract_filename(direct_link):\n",
+ "    print(\"Extracting the filename...\")\n",
+ "    for chunk in direct_link.strip().split('&'):\n",
+ "        if chunk.startswith('filename='):\n",
+ "            return chunk.split('=')[1]\n",
+ "    return None\n",
+ "\n",
+ "\n",
+ "def extract_csv_from_zip(filename: Text) -> None:\n",
+ "    if glob('./*.zip'):\n",
+ "        print(\"Extracting the dataset file...\")\n",
+ "        try:\n",
+ "            with zipfile.ZipFile(filename, 'r') as zip_file:\n",
+ "                zip_file.extractall()\n",
+ "        except zipfile.BadZipFile:\n",
+ "            sys.exit(f\"'{filename}' is not a valid zip archive\")\n",
+ "        clear_output()\n",
+ "        print(\"The dataset has been extracted successfully!\")\n",
+ "    else:\n",
+ "        sys.exit(\"File `.zip` Not Found!\")\n",
+ "\n",
+ "\n",
+ "def download_yadisk_link(sharing_link, filename=None):\n",
+ "    direct_link = _get_real_direct_link(sharing_link)\n",
+ "    if direct_link:\n",
+ "        # Try to recover the filename from the link\n",
+ "        filename = filename or _extract_filename(direct_link)\n",
+ "        clear_output()\n",
+ "\n",
+ "        print(\"Downloading the file...\")\n",
+ "        download = requests.get(direct_link)\n",
+ "        with open(filename, 'wb') as out_file:\n",
+ "            out_file.write(download.content)\n",
+ "\n",
+ "        clear_output()\n",
+ "        print('Downloaded \"{}\" to \"{}\"'.format(sharing_link, filename))\n",
+ "        extract_csv_from_zip(filename)\n",
+ "    else:\n",
+ "        print('Failed to download \"{}\"'.format(sharing_link))"
+ ],
+ "metadata": {
+ "id": "8gg7lhfreCmm"
+ },
+ "execution_count": 23,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "download_yadisk_link(\"https://yadi.sk/d/vVEOWPFY3NruT7\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HwOBovvOdZuD",
+ "outputId": "c1213b58-8474-4e55-b2f7-975a0c1631df"
+ },
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "The dataset has been extracted successfully!\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "collapsed": true,
+ "id": "ZTjSL2cCxYPZ",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 381
+ },
+ "outputId": "df32a5f0-83ca-4b9c-a719-878b6ade8936"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Id Title \\\n",
+ "92785 69182809 Residential care home manager Birmingham \n",
+ "97710 69268730 Lead Exploration Geophysicist \n",
+ "67167 68716041 IT Recruitment Manager – IT Recruitment Sector \n",
+ "\n",
+ " FullDescription LocationRaw \\\n",
+ "92785 We are looking for a Residential care home man...
Birmingham \n", + "97710 This Australian oil and natural gas exploratio... Australia \n", + "67167 IT Recruitment Manager – IT Recruitment Sector... London South East \n", + "\n", + " LocationNormalized ContractType ContractTime Company \\\n", + "92785 Birmingham full_time NaN Purely Health Care \n", + "97710 UK NaN permanent NaN \n", + "67167 South East London NaN permanent 5Q \n", + "\n", + " Category \\\n", + "92785 Healthcare & Nursing Jobs \n", + "97710 Energy, Oil & Gas Jobs \n", + "67167 HR & Recruitment Jobs \n", + "\n", + " SalaryRaw SalaryNormalized \\\n", + "92785 20,000 - 35,000/Year 27500 \n", + "97710 150k+ AUD 150000 \n", + "67167 From 40,000 to 50,000 per annum + TEAM OVERRID... 45000 \n", + "\n", + " SourceName Log1pSalary \n", + "92785 staffnurse.com 10.221977 \n", + "97710 hays.co.uk 11.918397 \n", + "67167 totaljobs.com 10.714440 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdTitleFullDescriptionLocationRawLocationNormalizedContractTypeContractTimeCompanyCategorySalaryRawSalaryNormalizedSourceNameLog1pSalary
9278569182809Residential care home manager BirminghamWe are looking for a Residential care home man...BirminghamBirminghamfull_timeNaNPurely Health CareHealthcare & Nursing Jobs20,000 - 35,000/Year27500staffnurse.com10.221977
9771069268730Lead Exploration GeophysicistThis Australian oil and natural gas exploratio...AustraliaUKNaNpermanentNaNEnergy, Oil & Gas Jobs150k+ AUD150000hays.co.uk11.918397
6716768716041IT Recruitment Manager – IT Recruitment SectorIT Recruitment Manager – IT Recruitment Sector...London South EastSouth East LondonNaNpermanent5QHR & Recruitment JobsFrom 40,000 to 50,000 per annum + TEAM OVERRID...45000totaljobs.com10.714440
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], + "source": [ + "data = pd.read_csv(\"./Train_rev1.csv\", index_col=None)\n", + "data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')\n", + "\n", + "text_columns = [\"Title\", \"FullDescription\"]\n", + "categorical_columns = [\"Category\", \"Company\", \"LocationNormalized\", \"ContractType\", \"ContractTime\"]\n", + "target_column = \"Log1pSalary\"\n", + "data[categorical_columns] = data[categorical_columns].fillna('NaN') # cast nan to string\n", + "\n", + "data.sample(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-Fa1BQE0xYPb" + }, + "source": [ + "### The NLP part\n", + "\n", + "To even begin training our neural network, we're gonna need to preprocess the text features: tokenize it and build the token vocabularies.\n", + "\n", + "Since it is not an NLP course, we're gonna use simple built-in NLTK tokenization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "mATF2HFsxYPc" + }, + "outputs": [], + "source": [ + "print(\"Before\")\n", + "print(data[\"Title\"][::100000])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "3O8pdtlJxYPd" + }, + "outputs": [], + "source": [ + "import nltk\n", + "tokenizer = nltk.tokenize.WordPunctTokenizer()\n", + "\n", + "for col in text_columns:\n", + " data[col] = data[col].apply(lambda l: ' '.join(tokenizer.tokenize(str(l).lower())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TNoDy99RxYPe" + }, + "source": [ + "Now we can assume that our text is a space-separated list of tokens:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "PA_dDQQ7xYPf" + }, + "outputs": [], + "source": [ + "print(\"After\")\n", + "print(data[\"Title\"][::100000])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A1QZyGIlxYPh" + }, + "source": [ + "Not all words are equally useful. Some of them are typos or rare words that are only present a few times. \n", + "\n", + "Let's see how many times is each word present in the data so that we can build a \"white list\" of known words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "W3viwPQQxYPi" + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "token_counts = Counter()\n", + "\n", + "# Count how many times does each token occur in \"Title\" and \"FullDescription\"\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "gZLPO_FnxYPj" + }, + "outputs": [], + "source": [ + "print(\"Total unique tokens :\", len(token_counts))\n", + "print('\\n'.join(map(str, token_counts.most_common(n=5))))\n", + "print('...')\n", + "print('\\n'.join(map(str, token_counts.most_common()[-3:])))\n", + "\n", + "assert token_counts.most_common(1)[0][1] in range(2600000, 2700000)\n", + "assert len(token_counts) in range(200000, 210000)\n", + "print('Correct!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "tBw1JAiwxYPk" + }, + "outputs": [], + "source": [ + "# Let's see how many words are there for each count\n", + "\n", + "_=plt.hist(list(token_counts.values()), range=[0, 10**4], bins=50, log=True)\n", + "plt.xlabel(\"Counts\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NMb7Y7rFxYPl" + }, + "source": [ + "__Task 1.1__ Get a list of all tokens that occur at least 10 times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "TVprO2lRxYPl" + }, + "outputs": [], + "source": [ + "min_count = 10\n", + "\n", + "# tokens from token_counts keys that had at least min_count occurrences throughout the dataset\n", + "tokens = \n", + "\n", + "# Add a special tokens for unknown and empty words\n", + "UNK, PAD = \"UNK\", \"PAD\"\n", + "tokens = [UNK, PAD] + tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "xzkFT9z3xYPm" + }, + "outputs": [], + "source": [ + "print(\"Tokens left:\", len(tokens))\n", + "assert type(tokens)==list\n", + "assert len(tokens) in range(32000,35000)\n", + "assert 'me' in tokens\n", + "assert UNK in tokens\n", + "print(\"Correct!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YCLTYvBLxYPn" + }, + "source": [ + "__Task 1.2__ Build an inverse token index: a dictionary from token(string) to it's index in `tokens` (int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "_CLYvEA-xYPo" + }, + "outputs": [], + "source": [ + "token_to_id = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "EVrX-1xfxYPp" + }, + "outputs": [], + "source": [ + "assert isinstance(token_to_id, dict)\n", + "assert len(token_to_id) == len(tokens)\n", + "for tok in tokens:\n", + " assert tokens[token_to_id[tok]] == tok\n", + "\n", + "print(\"Correct!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gJWXr60gxYPp" + }, + "source": [ + "And finally, let's use the vocabulary you've built to map text lines into torch-digestible matrices." 
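For reference, here is one hedged sketch of what Tasks 1.1 and 1.2 above might produce, assuming `token_counts` was filled in as suggested earlier (the actual assignment cells are left blank on purpose):

```python
min_count = 10

# Task 1.1: keep tokens that occur at least min_count times
tokens = sorted(t for t, c in token_counts.items() if c >= min_count)

UNK, PAD = "UNK", "PAD"
tokens = [UNK, PAD] + tokens                # special tokens go first

# Task 1.2: inverse index from a token to its position in `tokens`
token_to_id = {tok: idx for idx, tok in enumerate(tokens)}
assert all(tokens[token_to_id[t]] == t for t in tokens)
```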
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "FDH_iVMexYPq"
+ },
+ "outputs": [],
+ "source": [
+ "UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])\n",
+ "\n",
+ "def as_matrix(sequences, max_len=None):\n",
+ "    \"\"\" Convert a list of tokens into a matrix with padding \"\"\"\n",
+ "    if isinstance(sequences[0], str):\n",
+ "        sequences = list(map(str.split, sequences))\n",
+ "\n",
+ "    max_len = min(max(map(len, sequences)), max_len or float('inf'))\n",
+ "\n",
+ "    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))\n",
+ "    for i, seq in enumerate(sequences):\n",
+ "        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]\n",
+ "        matrix[i, :len(row_ix)] = row_ix\n",
+ "\n",
+ "    return matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "ZSpcpOsFxYPq"
+ },
+ "outputs": [],
+ "source": [
+ "print(\"Lines:\")\n",
+ "print('\\n'.join(data[\"Title\"][::100000].values), end='\\n\\n')\n",
+ "print(\"Matrix:\")\n",
+ "print(as_matrix(data[\"Title\"][::100000]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fSEIEnY2xYPq"
+ },
+ "source": [
+ "Now let's encode the categorical data we have.\n",
+ "\n",
+ "As usual, we shall use one-hot encoding for simplicity. Kudos if you implement tf-idf, target averaging or pseudo-counter-based encoding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "PoeHwkczxYPq"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.feature_extraction import DictVectorizer\n",
+ "\n",
+ "# we only consider top-1k most frequent companies to minimize memory usage\n",
+ "top_companies, top_counts = zip(*Counter(data['Company']).most_common(1000))\n",
+ "recognized_companies = set(top_companies)\n",
+ "data[\"Company\"] = data[\"Company\"].apply(lambda comp: comp if comp in recognized_companies else \"Other\")\n",
+ "\n",
+ "categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)\n",
+ "categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Wjb_HM0axYPr"
+ },
+ "source": [
+ "### The data science part\n",
+ "\n",
+ "Once we've learned to tokenize the data, let's design a machine learning experiment.\n",
+ "\n",
+ "As before, we won't focus too much on validation, opting for a simple train-test split.\n",
+ "\n",
+ "__To be completely rigorous,__ we've committed a small crime here: we used the whole data for tokenization and vocabulary building. A stricter way would be to do that part on the training set only. You may want to do that and measure the magnitude of changes."
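If you want to try the stricter protocol hinted at above (vocabulary built on the training part only), a sketch could look like the following. It reuses `data`, `text_columns`, `UNK`, `PAD` and `min_count` from earlier cells and is only meant to show the idea, not prescribe it:

```python
from collections import Counter
from sklearn.model_selection import train_test_split

# same split as in the next cell; only the vocabulary construction changes
data_train, data_val = train_test_split(data, test_size=0.1, random_state=42)

train_counts = Counter()
for col in text_columns:
    for line in data_train[col]:
        train_counts.update(line.split())

strict_tokens = [UNK, PAD] + sorted(t for t, c in train_counts.items() if c >= min_count)
# compare len(strict_tokens) to len(tokens) and re-run the pipeline to measure the effect
```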
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "1y7dQLchxYPr" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "data_train, data_val = train_test_split(data, test_size=0.1, random_state=42)\n", + "\n", + "print(\"Train size = \", len(data_train))\n", + "print(\"Validation size = \", len(data_val))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "_mEmdhV3xYPs" + }, + "outputs": [], + "source": [ + "def generate_batch(data, batch_size=None, replace=True, max_len=None):\n", + " \"\"\"\n", + " Creates a pytorch-friendly dict from the batch data.\n", + " :returns: a dict with {'title' : int64[batch, title_max_len]\n", + " \"\"\"\n", + " if batch_size is not None:\n", + " data = data.sample(batch_size, replace=replace)\n", + " \n", + " batch = {}\n", + " for col in text_columns:\n", + " batch[col] = as_matrix(data[col].values, max_len)\n", + " \n", + " batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))\n", + " \n", + " if target_column in data.columns:\n", + " batch[target_column] = data[target_column].values\n", + " \n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "mREIidF7xYPs" + }, + "outputs": [], + "source": [ + "generate_batch(data_train, 3, max_len=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C11o0IznxYPt" + }, + "source": [ + "### Finally, let's talk deep learning\n", + "\n", + "Out model consists of three branches:\n", + "* Title encoder\n", + "* Description encoder\n", + "* Categorical features encoder\n", + "\n", + "We will then feed all 3 branches into one common network that predicts salary.\n", + "\n", + "![scheme](https://github.com/yandexdataschool/Practical_DL/raw/master/homework04/conv_salary_architecture.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GP5w1jX2xYPt" + }, + "source": [ + "By default, both text vectorizers shall use 1d convolutions, followed by global pooling over time." 
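To make the "global pooling over time" idea concrete before the encoder classes below, here is a tiny standalone illustration (the shapes are assumptions for the example only): a Conv1d output of shape `[batch, channels, time]` is reduced to a fixed-size `[batch, channels]` vector by taking the max over the time axis, which is what the `GlobalMaxPooling` module in the next cell does.

```python
import torch

h = torch.randn(2, 4, 7)       # [batch=2, channels=4, time=7], e.g. a Conv1d output
pooled = h.max(dim=-1)[0]      # max over the time axis -> torch.Size([2, 4])
print(pooled.shape)            # fixed-size vector regardless of sequence length
```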
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "PEtzjT_axYPt" + }, + "outputs": [], + "source": [ + "import torch, torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.autograd import Variable\n", + "\n", + "class GlobalMaxPooling(nn.Module):\n", + " def __init__(self, dim=-1):\n", + " super(self.__class__, self).__init__()\n", + " self.dim = dim\n", + " \n", + " def forward(self, x):\n", + " return x.max(dim=self.dim)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "joss8iRhxYPu" + }, + "outputs": [], + "source": [ + "class TitleEncoder(nn.Module):\n", + " def __init__(self, n_tokens=len(tokens), out_size=64):\n", + " \"\"\" \n", + " A simple sequential encoder for titles.\n", + " x -> emb -> conv -> global_max -> relu -> dense\n", + " \"\"\"\n", + " super(self.__class__, self).__init__()\n", + " self.emb = nn.Embedding(n_tokens, 64, padding_idx=PAD_IX)\n", + " self.conv1 = nn.Conv1d(64, out_size, kernel_size=3, padding=1)\n", + " self.pool1 = GlobalMaxPooling() \n", + " self.dense = nn.Linear(out_size, out_size)\n", + "\n", + " def forward(self, text_ix):\n", + " \"\"\"\n", + " :param text_ix: int64 Variable of shape [batch_size, max_len]\n", + " :returns: float32 Variable of shape [batch_size, out_size]\n", + " \"\"\"\n", + " h = self.emb(text_ix)\n", + "\n", + " # we transpose from [batch, time, units] to [batch, units, time] to fit Conv1d dim order\n", + " h = torch.transpose(h, 1, 2)\n", + " \n", + " # Apply the layers as defined above. Add some ReLUs before dense.\n", + " \n", + " \n", + " return " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "UdB1WMfuxYPv" + }, + "outputs": [], + "source": [ + "title_encoder = TitleEncoder(out_size=64)\n", + "\n", + "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['Title']))\n", + "dummy_v = title_encoder(dummy_x)\n", + "\n", + "assert isinstance(dummy_v, Variable)\n", + "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", + "\n", + "del title_encoder\n", + "print(\"Seems fine\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dl4zkBYcxYPv" + }, + "source": [ + "__Task 2.1__ Create description encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "JTABwuBqxYPw" + }, + "outputs": [], + "source": [ + "# Define an encoder for job descriptions.\n", + "# Use any means you want so long as it's torch.nn.Module.\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "k-rtbGlbxYPw" + }, + "outputs": [], + "source": [ + "desc_encoder = \n", + "\n", + "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['FullDescription']))\n", + "dummy_v = desc_encoder(dummy_x)\n", + "\n", + "assert isinstance(dummy_v, Variable)\n", + "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", + "del desc_encoder\n", + "print(\"Seems fine too\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iNQWQUw3xYPw" + }, + "source": [ + "__ Task 2.2__ Build one network ~~to rule them all~~" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "AhSwatjhxYPx" + }, + "outputs": [], + "source": [ + "class FullNetwork(nn.Module):\n", + " \"\"\"\n", + " This class does all the steps from (title, desc, categorical) features -> predicted 
target\n", + " It unites title & desc encoders you defined above as long as some layers for head and categorical branch.\n", + " \"\"\"\n", + " \n", + " def __init__(self, n_tokens=len(tokens), n_cat_features=len(categorical_vectorizer.vocabulary_)):\n", + " super(self.__class__, self).__init__()\n", + " \n", + " self.title_encoder = TitleEncoder(out_size=64)\n", + " self.desc_encoder = \n", + " \n", + " # define layers for categorical features. A few dense layers would do.\n", + " \n", + " \n", + " # define \"output\" layers that process depend the three encoded vectors into answer\n", + " \n", + " \n", + " \n", + " def forward(self, title_ix, desc_ix, cat_features):\n", + " \"\"\"\n", + " :param title_ix: int32 Variable [batch, title_len], job titles encoded by as_matrix\n", + " :param desc_ix: int32 Variable [batch, desc_len] , job descriptions encoded by as_matrix\n", + " :param cat_features: float32 Variable [batch, n_cat_features]\n", + " :returns: float32 Variable 1d [batch], predicted log1p-salary\n", + " \"\"\"\n", + " \n", + " # process each data source with it's respective encoder\n", + " title_h = self.title_encoder(title_ix)\n", + " desc_h = \n", + " \n", + " # apply categorical encoder\n", + " cat_h = \n", + " \n", + " # concatenate all vectors together...\n", + " joint_h = torch.cat([title_h, desc_h, cat_h], dim=1)\n", + " \n", + " # ... and stack a few more layers at the top\n", + " \n", + " \n", + " # Note 1: do not forget to select first columns, [:, 0], to get to 1d outputs\n", + " # Note 2: please do not use output nonlinearities.\n", + " \n", + " return " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "1y7jbwTGxYPy" + }, + "outputs": [], + "source": [ + "model = FullNetwork()\n", + "opt = torch.optim.Adam(model.parameters(), lr=1e-3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "P4FEQid5xYPy" + }, + "outputs": [], + "source": [ + "# test it on one batch\n", + "\n", + "batch = generate_batch(data_train, 32)\n", + "\n", + "title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + "desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + "cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + "reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + "prediction = model(title_ix, desc_ix, cat_features)\n", + "\n", + "assert len(prediction.shape) == 1 and prediction.shape[0] == title_ix.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "Zp__hcLFxYPz" + }, + "outputs": [], + "source": [ + "def compute_loss(reference, prediction):\n", + " \"\"\"\n", + " Computes objective for minimization.\n", + " By deafult we minimize MSE, but you are encouraged to try mix up MSE, MAE, huber loss, etc.\n", + " \"\"\"\n", + " return torch.mean((prediction - reference) ** 2)\n", + "\n", + "def compute_mae(reference, prediction):\n", + " \"\"\" Compute MAE on actual salary, assuming your model outputs log1p(salary)\"\"\"\n", + " return torch.abs(torch.exp(reference - 1) - torch.exp(prediction - 1)).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "4aryeHS1xYPz" + }, + "outputs": [], + "source": [ + "loss = compute_loss(reference, prediction)\n", + "dummy_grads = torch.autograd.grad(loss, model.parameters(), retain_graph=True)\n", + "for grad in dummy_grads:\n", + " assert grad is not 
None and not (grad == 0).all(), \"Some model parameters received zero grads. \" \\\n", + " \"Double-check that your model uses all it's layers.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dKlUtZumxYPz" + }, + "source": [ + "### Let's train it!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "MniUXvdGxYP0" + }, + "outputs": [], + "source": [ + "from tqdm import tnrange\n", + "def iterate_minibatches(data, batch_size=32, max_len=None,\n", + " max_batches=None, shuffle=True, verbose=True):\n", + " indices = np.arange(len(data))\n", + " if shuffle:\n", + " indices = np.random.permutation(indices)\n", + " if max_batches is not None:\n", + " indices = indices[: batch_size * max_batches]\n", + " \n", + " irange = tnrange if verbose else range\n", + " \n", + " for start in irange(0, len(indices), batch_size):\n", + " yield generate_batch(data.iloc[indices[start : start + batch_size]], max_len=max_len)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "pVAHg1Z8xYP1" + }, + "outputs": [], + "source": [ + "num_epochs = 100\n", + "max_len = 100\n", + "batch_size = 32\n", + "batches_per_epoch = 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "WHie5jdjxYP1" + }, + "outputs": [], + "source": [ + "for epoch_i in range(num_epochs):\n", + " \n", + " print(\"Training:\")\n", + " train_loss = train_mae = train_batches = 0 \n", + " model.train(True)\n", + " \n", + " for batch in iterate_minibatches(data_train, max_batches=batches_per_epoch):\n", + "\n", + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + " reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + " prediction = model(title_ix, desc_ix, cat_features)\n", + "\n", + " loss = compute_loss(reference, prediction)\n", + " loss.backward()\n", + " opt.step()\n", + " opt.zero_grad()\n", + "\n", + " train_loss += loss.data.numpy()\n", + " train_mae += compute_mae(reference, prediction).data.numpy()\n", + " train_batches += 1\n", + " \n", + " print(\"\\tLoss:\\t%.5f\" % (train_loss / train_batches))\n", + " print(\"\\tMAE:\\t%.5f\" % (train_mae / train_batches))\n", + " print('\\n\\n')\n", + " \n", + " print(\"Validation:\")\n", + " val_loss = val_mae = val_batches = 0\n", + " model.train(False)\n", + " \n", + " with torch.no_grad():\n", + " for batch in iterate_minibatches(data_val, shuffle=False):\n", + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + " reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + " prediction = model(title_ix, desc_ix, cat_features)\n", + " loss = compute_loss(reference, prediction)\n", + "\n", + " val_loss += loss.data.numpy()\n", + " val_mae += compute_mae(reference, prediction).data.numpy()\n", + " val_batches += 1\n", + "\n", + " print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", + " print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", + " print('\\n\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "5kJvB3CRxYP2" + }, + "outputs": [], + "source": [ + "print(\"Final eval:\")\n", + 
"val_loss = val_mae = val_batches = 0\n", + "\n", + "with torch.no_grad():\n", + " for batch in iterate_minibatches(data_val, shuffle=False):\n", + " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", + " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", + " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", + " reference = Variable(torch.FloatTensor(batch[target_column]))\n", + "\n", + " prediction = model(title_ix, desc_ix, cat_features)\n", + " loss = compute_loss(reference, prediction)\n", + "\n", + " val_loss += loss.data.numpy()\n", + " val_mae += compute_mae(reference, prediction).data.numpy()\n", + " val_batches += 1\n", + "\n", + "print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", + "print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", + "print('\\n\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yAVbs759xYP2" + }, + "source": [ + "### Task 3.2: Actually make it work\n", + "\n", + "Your main task is to use some of the tricks you've learned on the network and analyze if you can improve __validation MAE__.\n", + "\n", + "Try __at least 3 options__ from the list below for a passing grade. If you're into \n", + "\n", + "#### A) CNN architecture\n", + "\n", + "All the tricks you know about dense and convolutional neural networks apply here as well.\n", + "* Dropout. Nuff said.\n", + "* Batch Norm. This time it's `nn.BatchNorm1d`\n", + "* Parallel convolution layers. The idea is that you apply several nn.Conv1d to the same embeddings and concatenate output channels.\n", + "* More layers, more neurons, ya know...\n", + "\n", + "\n", + "#### B) Play with pooling\n", + "\n", + "There's more than one way to do max pooling:\n", + "* Max over time - our `GlobalMaxPooling`\n", + "* Average over time (excluding PAD)\n", + "* Softmax-pooling:\n", + "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot {{e ^ {h_{i, t}}} \\over \\sum_\\tau e ^ {h_{j, \\tau}} } }$$\n", + "\n", + "* Attentive pooling\n", + "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot Attn(h_t)}$$\n", + "\n", + ", where $$ Attn(h_t) = {{e ^ {NN_{attn}(h_t)}} \\over \\sum_\\tau e ^ {NN_{attn}(h_\\tau)}} $$\n", + "and $NN_{attn}$ is a small neural network\n", + "\n", + "\n", + "The optimal score is usually achieved by concatenating several different poolings, including several attentive pooling with different $NN_{attn}$\n", + "\n", + "#### C) Fun with embeddings\n", + "\n", + "It's not always a good idea to train embeddings from scratch. Here's a few tricks:\n", + "\n", + "* Use a pre-trained word2vec from [here](http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/) or [here](http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/).\n", + "* Start with pre-trained embeddings, then fine-tune them with gradient descent\n", + "* Use the same embedding matrix in title and desc vectorizer\n", + "\n", + "#### D) Going recurrent\n", + "\n", + "We've already learned that recurrent networks can do cool stuff in sequence modelling. Turns out, they're not useless for classification as well. With some tricks of course..\n", + "\n", + "* Like convolutional layers, LSTM should be pooled into a fixed-size vector with some of the poolings.\n", + " * Please bear in mind that while convolution uses [batch, units, time] dim order, \n", + " recurrent units are built for [batch, time, unit]. 
You may need to `torch.transpose`.\n", + "\n", + "* Since you know all the text in advance, use bidirectional RNN\n", + " * Run one LSTM from left to right\n", + " * Run another in parallel from right to left \n", + " * Concatenate their output sequences along unit axis (dim=-1)\n", + "\n", + "* It might be good idea to mix convolutions and recurrent layers differently for title and description\n", + "\n", + "\n", + "#### E) Optimizing seriously\n", + "\n", + "* You don't necessarily need 100 epochs. Use early stopping. If you've never done this before, take a look at [keras](https://github.com/keras-team/keras/blob/master/keras/callbacks.py#L461) for inspiration.\n", + " * In short, train until you notice that validation\n", + " * Maintain the best-on-validation snapshot via `model.state_dict`\n", + " * Plotting learning curves is usually a good idea" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yRyIn5htxYP3" + }, + "source": [ + "### A short report\n", + "\n", + "Please tell us what you did and how did it work.\n", + "\n", + "``, i guess..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "tDSMuYg3xYP4" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "colab": { + "provenance": [] + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### About the challenge\n", - "For starters, let's download the data from __[here](https://yadi.sk/d/vVEOWPFY3NruT7)__.\n", - "\n", - "You can also get it from the competition [page](https://www.kaggle.com/c/job-salary-prediction/data) (in that case, pick `Train_rev1.*`).\n", - "\n", - "\n", - "Our task is to predict one number, __SalaryNormalized__, in the sense of minimizing __Mean Absolute Error__.\n", - "\n", - "\n", - "\n", - "To do so, our model ca access a number of features:\n", - "* Free text: __`Title`__ and __`FullDescription`__\n", - "* Categorical: __`Category`__, __`Company`__, __`LocationNormalized`__, __`ContractType`__, and __`ContractTime`__.\n", - "\n", - "\n", - "You can read more [in the official description](https://www.kaggle.com/c/job-salary-prediction#description)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "data = pd.read_csv(\"./Train_rev1.csv\", index_col=None)\n", - "data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')\n", - "\n", - "text_columns = [\"Title\", \"FullDescription\"]\n", - "categorical_columns = [\"Category\", \"Company\", \"LocationNormalized\", \"ContractType\", \"ContractTime\"]\n", - "target_column = \"Log1pSalary\"\n", - "data[categorical_columns] = data[categorical_columns].fillna('NaN') # cast nan to string\n", - "\n", - "data.sample(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The NLP part\n", - "\n", - "To even begin training our neural network, we're gonna need to preprocess the text features: tokenize it and build the token vocabularies.\n", - "\n", - "Since it is not an NLP course, we're gonna use simple built-in NLTK tokenization." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Before\")\n", - "print(data[\"Title\"][::100000])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import nltk\n", - "tokenizer = nltk.tokenize.WordPunctTokenizer()\n", - "\n", - "for col in text_columns:\n", - " data[col] = data[col].apply(lambda l: ' '.join(tokenizer.tokenize(str(l).lower())))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can assume that our text is a space-separated list of tokens:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"After\")\n", - "print(data[\"Title\"][::100000])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Not all words are equally useful. Some of them are typos or rare words that are only present a few times. \n", - "\n", - "Let's see how many times is each word present in the data so that we can build a \"white list\" of known words." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from collections import Counter\n", - "token_counts = Counter()\n", - "\n", - "# Count how many times does each token occur in \"Title\" and \"FullDescription\"\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Total unique tokens :\", len(token_counts))\n", - "print('\\n'.join(map(str, token_counts.most_common(n=5))))\n", - "print('...')\n", - "print('\\n'.join(map(str, token_counts.most_common()[-3:])))\n", - "\n", - "assert token_counts.most_common(1)[0][1] in range(2600000, 2700000)\n", - "assert len(token_counts) in range(200000, 210000)\n", - "print('Correct!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Let's see how many words are there for each count\n", - "\n", - "_=plt.hist(list(token_counts.values()), range=[0, 10**4], bins=50, log=True)\n", - "plt.xlabel(\"Counts\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Task 1.1__ Get a list of all tokens that occur at least 10 times." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "min_count = 10\n", - "\n", - "# tokens from token_counts keys that had at least min_count occurrences throughout the dataset\n", - "tokens = \n", - "\n", - "# Add a special tokens for unknown and empty words\n", - "UNK, PAD = \"UNK\", \"PAD\"\n", - "tokens = [UNK, PAD] + tokens" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Tokens left:\", len(tokens))\n", - "assert type(tokens)==list\n", - "assert len(tokens) in range(32000,35000)\n", - "assert 'me' in tokens\n", - "assert UNK in tokens\n", - "print(\"Correct!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Task 1.2__ Build an inverse token index: a dictionary from token(string) to it's index in `tokens` (int)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "token_to_id = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "assert isinstance(token_to_id, dict)\n", - "assert len(token_to_id) == len(tokens)\n", - "for tok in tokens:\n", - " assert tokens[token_to_id[tok]] == tok\n", - "\n", - "print(\"Correct!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And finally, let's use the vocabulary you've built to map text lines into torch-digestible matrices." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])\n", - "\n", - "def as_matrix(sequences, max_len=None):\n", - " \"\"\" Convert a list of tokens into a matrix with padding \"\"\"\n", - " if isinstance(sequences[0], str):\n", - " sequences = list(map(str.split, sequences))\n", - " \n", - " max_len = min(max(map(len, sequences)), max_len or float('inf'))\n", - " \n", - " matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))\n", - " for i,seq in enumerate(sequences):\n", - " row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]\n", - " matrix[i, :len(row_ix)] = row_ix\n", - " \n", - " return matrix" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#### print(\"Lines:\")\n", - "print('\\n'.join(data[\"Title\"][::100000].values), end='\\n\\n')\n", - "print(\"Matrix:\")\n", - "print(as_matrix(data[\"Title\"][::100000]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's encode the categirical data we have.\n", - "\n", - "As usual, we shall use one-hot encoding for simplicity. Kudos if you implement tf-idf, target averaging or pseudo-counter-based encoding." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sklearn.feature_extraction import DictVectorizer\n", - "\n", - "# we only consider top-1k most frequent companies to minimize memory usage\n", - "top_companies, top_counts = zip(*Counter(data['Company']).most_common(1000))\n", - "recognized_companies = set(top_companies)\n", - "data[\"Company\"] = data[\"Company\"].apply(lambda comp: comp if comp in recognized_companies else \"Other\")\n", - "\n", - "categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)\n", - "categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The data science part\n", - "\n", - "Once we've learned to tokenize the data, let's design a machine learning experiment.\n", - "\n", - "As before, we won't focus too much on validation, opting for a simple train-test split.\n", - "\n", - "__To be completely rigorous,__ we've comitted a small crime here: we used the whole data for tokenization and vocabulary building. A more strict way would be to do that part on training set only. You may want to do that and measure the magnitude of changes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "data_train, data_val = train_test_split(data, test_size=0.1, random_state=42)\n", - "\n", - "print(\"Train size = \", len(data_train))\n", - "print(\"Validation size = \", len(data_val))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def generate_batch(data, batch_size=None, replace=True, max_len=None):\n", - " \"\"\"\n", - " Creates a pytorch-friendly dict from the batch data.\n", - " :returns: a dict with {'title' : int64[batch, title_max_len]\n", - " \"\"\"\n", - " if batch_size is not None:\n", - " data = data.sample(batch_size, replace=replace)\n", - " \n", - " batch = {}\n", - " for col in text_columns:\n", - " batch[col] = as_matrix(data[col].values, max_len)\n", - " \n", - " batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))\n", - " \n", - " if target_column in data.columns:\n", - " batch[target_column] = data[target_column].values\n", - " \n", - " return batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "generate_batch(data_train, 3, max_len=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Finally, let's talk deep learning\n", - "\n", - "Out model consists of three branches:\n", - "* Title encoder\n", - "* Description encoder\n", - "* Categorical features encoder\n", - "\n", - "We will then feed all 3 branches into one common network that predicts salary.\n", - "\n", - "![scheme](https://github.com/yandexdataschool/Practical_DL/raw/master/homework04/conv_salary_architecture.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, both text vectorizers shall use 1d convolutions, followed by global pooling over time." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import torch, torch.nn as nn\n", - "import torch.nn.functional as F\n", - "from torch.autograd import Variable\n", - "\n", - "class GlobalMaxPooling(nn.Module):\n", - " def __init__(self, dim=-1):\n", - " super(self.__class__, self).__init__()\n", - " self.dim = dim\n", - " \n", - " def forward(self, x):\n", - " return x.max(dim=self.dim)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "class TitleEncoder(nn.Module):\n", - " def __init__(self, n_tokens=len(tokens), out_size=64):\n", - " \"\"\" \n", - " A simple sequential encoder for titles.\n", - " x -> emb -> conv -> global_max -> relu -> dense\n", - " \"\"\"\n", - " super(self.__class__, self).__init__()\n", - " self.emb = nn.Embedding(n_tokens, 64, padding_idx=PAD_IX)\n", - " self.conv1 = nn.Conv1d(64, out_size, kernel_size=3, padding=1)\n", - " self.pool1 = GlobalMaxPooling() \n", - " self.dense = nn.Linear(out_size, out_size)\n", - "\n", - " def forward(self, text_ix):\n", - " \"\"\"\n", - " :param text_ix: int64 Variable of shape [batch_size, max_len]\n", - " :returns: float32 Variable of shape [batch_size, out_size]\n", - " \"\"\"\n", - " h = self.emb(text_ix)\n", - "\n", - " # we transpose from [batch, time, units] to [batch, units, time] to fit Conv1d dim order\n", - " h = torch.transpose(h, 1, 2)\n", - " \n", - " # Apply the layers as defined above. Add some ReLUs before dense.\n", - " \n", - " \n", - " return " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "title_encoder = TitleEncoder(out_size=64)\n", - "\n", - "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['Title']))\n", - "dummy_v = title_encoder(dummy_x)\n", - "\n", - "assert isinstance(dummy_v, Variable)\n", - "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", - "\n", - "del title_encoder\n", - "print(\"Seems fine\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Task 2.1__ Create description encoder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Define an encoder for job descriptions.\n", - "# Use any means you want so long as it's torch.nn.Module.\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "desc_encoder = \n", - "\n", - "dummy_x = Variable(torch.LongTensor(generate_batch(data_train, 3)['FullDescription']))\n", - "dummy_v = desc_encoder(dummy_x)\n", - "\n", - "assert isinstance(dummy_v, Variable)\n", - "assert tuple(dummy_v.shape) == (dummy_x.shape[0], 64)\n", - "del desc_encoder\n", - "print(\"Seems fine too\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__ Task 2.2__ Build one network ~~to rule them all~~" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "class FullNetwork(nn.Module):\n", - " \"\"\"\n", - " This class does all the steps from (title, desc, categorical) features -> predicted target\n", - " It unites title & desc encoders you defined above as long as some layers for head and categorical branch.\n", - " \"\"\"\n", - " \n", - " def __init__(self, n_tokens=len(tokens), 
n_cat_features=len(categorical_vectorizer.vocabulary_)):\n", - " super(self.__class__, self).__init__()\n", - " \n", - " self.title_encoder = TitleEncoder(out_size=64)\n", - " self.desc_encoder = \n", - " \n", - " # define layers for categorical features. A few dense layers would do.\n", - " \n", - " \n", - " # define \"output\" layers that process depend the three encoded vectors into answer\n", - " \n", - " \n", - " \n", - " def forward(self, title_ix, desc_ix, cat_features):\n", - " \"\"\"\n", - " :param title_ix: int32 Variable [batch, title_len], job titles encoded by as_matrix\n", - " :param desc_ix: int32 Variable [batch, desc_len] , job descriptions encoded by as_matrix\n", - " :param cat_features: float32 Variable [batch, n_cat_features]\n", - " :returns: float32 Variable 1d [batch], predicted log1p-salary\n", - " \"\"\"\n", - " \n", - " # process each data source with it's respective encoder\n", - " title_h = self.title_encoder(title_ix)\n", - " desc_h = \n", - " \n", - " # apply categorical encoder\n", - " cat_h = \n", - " \n", - " # concatenate all vectors together...\n", - " joint_h = torch.cat([title_h, desc_h, cat_h], dim=1)\n", - " \n", - " # ... and stack a few more layers at the top\n", - " \n", - " \n", - " # Note 1: do not forget to select first columns, [:, 0], to get to 1d outputs\n", - " # Note 2: please do not use output nonlinearities.\n", - " \n", - " return " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "model = FullNetwork()\n", - "opt = torch.optim.Adam(model.parameters(), lr=1e-3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# test it on one batch\n", - "\n", - "batch = generate_batch(data_train, 32)\n", - "\n", - "title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - "desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - "cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - "reference = Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - "prediction = model(title_ix, desc_ix, cat_features)\n", - "\n", - "assert len(prediction.shape) == 1 and prediction.shape[0] == title_ix.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def compute_loss(reference, prediction):\n", - " \"\"\"\n", - " Computes objective for minimization.\n", - " By deafult we minimize MSE, but you are encouraged to try mix up MSE, MAE, huber loss, etc.\n", - " \"\"\"\n", - " return torch.mean((prediction - reference) ** 2)\n", - "\n", - "def compute_mae(reference, prediction):\n", - " \"\"\" Compute MAE on actual salary, assuming your model outputs log1p(salary)\"\"\"\n", - " return torch.abs(torch.exp(reference - 1) - torch.exp(prediction - 1)).mean()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "loss = compute_loss(reference, prediction)\n", - "dummy_grads = torch.autograd.grad(loss, model.parameters(), retain_graph=True)\n", - "for grad in dummy_grads:\n", - " assert grad is not None and not (grad == 0).all(), \"Some model parameters received zero grads. \" \\\n", - " \"Double-check that your model uses all it's layers.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Let's train it!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from tqdm import tnrange\n", - "def iterate_minibatches(data, batch_size=32, max_len=None,\n", - " max_batches=None, shuffle=True, verbose=True):\n", - " indices = np.arange(len(data))\n", - " if shuffle:\n", - " indices = np.random.permutation(indices)\n", - " if max_batches is not None:\n", - " indices = indices[: batch_size * max_batches]\n", - " \n", - " irange = tnrange if verbose else range\n", - " \n", - " for start in irange(0, len(indices), batch_size):\n", - " yield generate_batch(data.iloc[indices[start : start + batch_size]], max_len=max_len)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "num_epochs = 100\n", - "max_len = 100\n", - "batch_size = 32\n", - "batches_per_epoch = 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for epoch_i in range(num_epochs):\n", - " \n", - " print(\"Training:\")\n", - " train_loss = train_mae = train_batches = 0 \n", - " model.train(True)\n", - " \n", - " for batch in iterate_minibatches(data_train, max_batches=batches_per_epoch):\n", - "\n", - " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - " reference = Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - " prediction = model(title_ix, desc_ix, cat_features)\n", - "\n", - " loss = compute_loss(reference, prediction)\n", - " loss.backward()\n", - " opt.step()\n", - " opt.zero_grad()\n", - "\n", - " train_loss += loss.data.numpy()\n", - " train_mae += compute_mae(reference, prediction).data.numpy()\n", - " train_batches += 1\n", - " \n", - " print(\"\\tLoss:\\t%.5f\" % (train_loss / train_batches))\n", - " print(\"\\tMAE:\\t%.5f\" % (train_mae / train_batches))\n", - " print('\\n\\n')\n", - " \n", - " print(\"Validation:\")\n", - " val_loss = val_mae = val_batches = 0\n", - " model.train(False)\n", - " \n", - " with torch.no_grad():\n", - " for batch in iterate_minibatches(data_val, shuffle=False):\n", - " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - " reference = Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - " prediction = model(title_ix, desc_ix, cat_features)\n", - " loss = compute_loss(reference, prediction)\n", - "\n", - " val_loss += loss.data.numpy()\n", - " val_mae += compute_mae(reference, prediction).data.numpy()\n", - " val_batches += 1\n", - "\n", - " print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", - " print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", - " print('\\n\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "print(\"Final eval:\")\n", - "val_loss = val_mae = val_batches = 0\n", - "\n", - "with torch.no_grad():\n", - " for batch in iterate_minibatches(data_val, shuffle=False):\n", - " title_ix = Variable(torch.LongTensor(batch[\"Title\"]))\n", - " desc_ix = Variable(torch.LongTensor(batch[\"FullDescription\"]))\n", - " cat_features = Variable(torch.FloatTensor(batch[\"Categorical\"]))\n", - " reference 
= Variable(torch.FloatTensor(batch[target_column]))\n", - "\n", - " prediction = model(title_ix, desc_ix, cat_features)\n", - " loss = compute_loss(reference, prediction)\n", - "\n", - " val_loss += loss.data.numpy()\n", - " val_mae += compute_mae(reference, prediction).data.numpy()\n", - " val_batches += 1\n", - "\n", - "print(\"\\tLoss:\\t%.5f\" % (val_loss / val_batches))\n", - "print(\"\\tMAE:\\t%.5f\" % (val_mae / val_batches))\n", - "print('\\n\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 3.2: Actually make it work\n", - "\n", - "Your main task is to use some of the tricks you've learned on the network and analyze if you can improve __validation MAE__.\n", - "\n", - "Try __at least 3 options__ from the list below for a passing grade. If you're into \n", - "\n", - "#### A) CNN architecture\n", - "\n", - "All the tricks you know about dense and convolutional neural networks apply here as well.\n", - "* Dropout. Nuff said.\n", - "* Batch Norm. This time it's `nn.BatchNorm1d`\n", - "* Parallel convolution layers. The idea is that you apply several nn.Conv1d to the same embeddings and concatenate output channels.\n", - "* More layers, more neurons, ya know...\n", - "\n", - "\n", - "#### B) Play with pooling\n", - "\n", - "There's more than one way to do max pooling:\n", - "* Max over time - our `GlobalMaxPooling`\n", - "* Average over time (excluding PAD)\n", - "* Softmax-pooling:\n", - "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot {{e ^ {h_{i, t}}} \\over \\sum_\\tau e ^ {h_{j, \\tau}} } }$$\n", - "\n", - "* Attentive pooling\n", - "$$ out_{i, t} = \\sum_t {h_{i,t} \\cdot Attn(h_t)}$$\n", - "\n", - ", where $$ Attn(h_t) = {{e ^ {NN_{attn}(h_t)}} \\over \\sum_\\tau e ^ {NN_{attn}(h_\\tau)}} $$\n", - "and $NN_{attn}$ is a small neural network\n", - "\n", - "\n", - "The optimal score is usually achieved by concatenating several different poolings, including several attentive pooling with different $NN_{attn}$\n", - "\n", - "#### C) Fun with embeddings\n", - "\n", - "It's not always a good idea to train embeddings from scratch. Here's a few tricks:\n", - "\n", - "* Use a pre-trained word2vec from [here](http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/) or [here](http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/).\n", - "* Start with pre-trained embeddings, then fine-tune them with gradient descent\n", - "* Use the same embedding matrix in title and desc vectorizer\n", - "\n", - "#### D) Going recurrent\n", - "\n", - "We've already learned that recurrent networks can do cool stuff in sequence modelling. Turns out, they're not useless for classification as well. With some tricks of course..\n", - "\n", - "* Like convolutional layers, LSTM should be pooled into a fixed-size vector with some of the poolings.\n", - " * Please bear in mind that while convolution uses [batch, units, time] dim order, \n", - " recurrent units are built for [batch, time, unit]. You may need to `torch.transpose`.\n", - "\n", - "* Since you know all the text in advance, use bidirectional RNN\n", - " * Run one LSTM from left to right\n", - " * Run another in parallel from right to left \n", - " * Concatenate their output sequences along unit axis (dim=-1)\n", - "\n", - "* It might be good idea to mix convolutions and recurrent layers differently for title and description\n", - "\n", - "\n", - "#### E) Optimizing seriously\n", - "\n", - "* You don't necessarily need 100 epochs. Use early stopping. 
If you've never done this before, take a look at [keras](https://github.com/keras-team/keras/blob/master/keras/callbacks.py#L461) for inspiration.\n", - " * In short, train until you notice that validation\n", - " * Maintain the best-on-validation snapshot via `model.state_dict`\n", - " * Plotting learning curves is usually a good idea" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### A short report\n", - "\n", - "Please tell us what you did and how did it work.\n", - "\n", - "``, i guess..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 0 }