diff --git a/NLP/1_2/HW2_Solonin_Spam detection.ipynb b/NLP/1_2/HW2_Solonin_Spam detection.ipynb new file mode 100644 index 0000000..22a2d48 --- /dev/null +++ b/NLP/1_2/HW2_Solonin_Spam detection.ipynb @@ -0,0 +1,663 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Homework 2: Spam Detection\n", + "#### *Author: Maxim Solonin*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Download sms-spam dataset https://archive.ics.uci.edu/ml/ datasets/sms+spam+collection\n", + "- Choose and argument metric for quality\n", + "- Code «by a hands» naive bayes for spam detection task;\n", + "- Choose a measure of a test's accuracy and argument your choice;\n", + "- Perform 5-fold validation for this task;\n", + "- Compare your results with sklearn naive_bayes;\n", + "- I expect your result as self-sufficient (with all comments/graph/etc.) Jupiter notebook in your GitHub in 2 weeks (next lecture)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 263, + "metadata": {}, + "outputs": [], + "source": [ + "# importing librarires\n", + "import scipy\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "import pandas as pd\n", + "from random import randrange\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.model_selection import KFold\n", + "from nltk import stem\n", + "from nltk.corpus import stopwords\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import seaborn as sns\n", + "\n", + "from matplotlib import pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LabelText
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
\n", + "
" + ], + "text/plain": [ + " Label Text\n", + "0 ham Go until jurong point, crazy.. Available only ...\n", + "1 ham Ok lar... Joking wif u oni...\n", + "2 spam Free entry in 2 a wkly comp to win FA Cup fina..." + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# reading the dataset\n", + "df = pd.read_csv('C:\\\\Users\\\\Maxim\\\\Desktop\\\\Education\\\\GSOM\\\\Natural Language and Image Processing\\\\HW\\\\HW2\\\\SMSSpamCollection', sep='\\t', names=[\"Label\", \"Text\"])\n", + "df.head(n=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, print the number of occurrences of each class in order to check for potential sample imbalance." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Spam/ham distribution')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAF4RJREFUeJzt3XuYHXWd5/H3Ry6igoISEBKYMJrdAUW8RMDbDCOzCDoK64ri6BIZZtBdncs+rg767AiKPKM7jqh4mWVWJHgDvC1RWSELqOOFSzIiV5EIKDFIggEEReTy3T/q13LSdne6Qp/uDv1+PU+eU/WrX1V9z8l5zqd/VXXqpKqQJGmyHjHTBUiSNi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxODRnJTk+yadmug7YsJYkuye5K8kWU7Ttf07y9236gCSrp2K7bXsvSHLtVG1PmweDQ0OX5PlJvpPkjiTrk3w7ybNnsJ7zkhw0U/vfmKr6SVVtW1X3T9QvyeuSfGsS23tDVZ0wFbUlqSRPHtj2v1bVv5+KbWvzYXBoqJI8FvgKcDLweGA+8E7gnhmq5zHAs4BvzMT+p9tUjVqkQQaHhu3fAVTVZ6vq/qq6u6rOq6rL4bd/NX87ycltRPKDJAeOrJzkqCTXJLkzyfVJXj+w7IAkq5O8NcnaJDcnOSzJi5P8sI1u3j6qngOBb1fVSHBtneT0tv2rkiwe2P6xSX7Ull2d5D8OLBup+6Qkt7fantvab2r1LBnvRUmyR5JvtG0vB3YcWLaw/WW/5cC+rm99b0jymiR7Av8MPKcd1rq99T0tyceSnJPkl8Aft7Z3j9r/25PcmuTGJK8ZaP96kr8Y9Ty/1aa/2Zq/3/b5qtGHvpLs2bZxe3s9Xzaw7LQkH0ny1fZcLk7ypPFeI81eBoeG7YfA/UmWJjkkyQ5j9NkPuJ7uw/M44ItJHt+WrQX+FHgscBRwUpJnDqz7RGAbupHMO4B/AV5LN6p4AfCOJL8/0P/FwFcH5l8GnAFsDywDPjyw7EdtG4+jGyV9Kskuo+q+HHgC8Jm2nWcDT241fDjJtuO8Lp8BVrbnfAIwZsi0EdKHgEOqajvgucBlVXUN8Abgu+2w1vYDq/0ZcCKwHTDWoawntv3Ob/s9JclGDzdV1R+2yX3aPs8cVetWwJeB84CdgL8CPj1q26+mey13AFa1OrWZMTg0VFX1C+D5QNF9qK9LsizJzgPd1gIfqKp724fRtcBL2vpfraofVecbdB9KLxhY917gxKq6l+6De0fgg1V1Z1VdBVwFPG2g/yHAOQPz36qqc9r5hE8C+wzU/rmqWlNVD7S6rgP2HVj3hqr6RFv3TGA34F1VdU9VnQf8hi5ENpBkd7qA+fvW95t0H7jjeQB4apJHVdXN7XlN5Oyq+nar+9fj9BnZ9zfogvSVG9nmZOwPbAu8p6p+U1UX0B2mfPVAny9W1SVVdR/waeDpU7BfTTODQ0NXVddU1euqagHwVGBX4AMDXX5aG95t88etD22UclE77HQ73Yhhx4G+Px84iXx3e7xlYPnddB9mJNkb+EVV3TSw/GcD078Cthk4RHRkksvaYZfbW+2D+x69H6pqzH2PsitwW1X9ctRz/h2tz6voRhc3t8M8fzBW3wE3bWT5WPvedSPrTMauwE1V9cCobc8fmB/9eo83ItMsZnBoWlXVD4DT6D6ER8xPkoH53YE1SR4JfAF4H7BzOxxzDjDYt4/Rh6nGleT36EZIbwKe0PZ95UPY96CbgR3aYagRu4/XuarOrar/AOwC/KDVBd0obsxVNrL/sfa9pk3/Enj0wLInbmRbg9YAuyUZ/FzZHfhpj21oM2BwaKiS/EGSNydZ0OZ3ozt0cdFAt52Av06yVZLDgT3pAmJr4JHAOuC+JIcAD+Uy2pew4WGqiTyG7gN4Xav7KDYMu01WVT8GVgDvTLJ1kucDLx2rb5Kdk7ysfdDfA9wFjIywbgEWJNl6E8oY2fcL6M4hfa61Xwa8PMmj0112e/So9W4Bfp+xXUwXPG9t/5cHtOd1xibUp1nM4NCw3Ul3EvnidpXPRXR/ub95oM/FwCLgVrqTpa+oqp9X1Z3AXwNnAbfRnfRdtilFJHkcXSB9ZzL9q+pq4J+A79J9WO4NfHtT9j2OP6N7XdbTXRBw+jj9HkH3Wq1pff8I+K9t2QV053B+luTWHvv+Gd3ruYbuPMMb2kgQ4CS6czO3AEvb8kHHA0vb4bsNzotU1W/oLjY4hO7/8qPAkQPb1sNE/CEnzaQkrwP+oqqeP+T9vJIukKbiJLA0pzni0FxxO91f05Ieoi1nugBpOrTLYyVNAQ9VSZJ68VCVJKmXoR6qSnIj3VU19wP3VdXidiuJM4GFwI3AK6vqtnYd/wfprrX/FfC6qvq3tp0lwP9om313VS2daL877rhjLVy4cMqfjyQ9nK1cufLWqpq3sX7TcY7jj6tq8FLBY4Hzq+o9SY5t839HdwnfovZvP+BjwH4taI4DFtNdV
78yybKqum28HS5cuJAVK1YM59lI0sNUkjHvYDDaTByqOpTu+nDa42ED7ae3exJdBGzfbij3ImB5Va1vYbEcOHi6i5YkdYYdHAWcl2RlkmNa285VdTNAe9yptc9nw3vsrG5t47VvIMkxSVYkWbFu3bopfhqSpBHDPlT1vKpak2QnYHmSib5BOtY9gGqC9g0bqk4BTgFYvHixl4pJ0pAMdcRRVWva41rgS3S3pL5l5DcN2uPa1n013W2pRyyguyXCeO2SpBkwtOBI8pgk241M092c7kq6ew2N/GjNEuDsNr0MODKd/YE72qGsc4GDkuzQfgTooNYmSZoBwzxUtTPwpXa37C2Bz1TV15JcCpyV5GjgJ8Dhrf85dJfirqK7HPcogKpan+QE4NLW711VtX6IdUuSJvCw/Ob44sWLy8txJamfJCuravHG+vnNcUlSLwaHJKkX7447jme9Zbzf1dFctvIfj5zpEqQZ54hDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi9DD44kWyT5XpKvtPk9klyc5LokZybZurU/ss2vassXDmzjba392iQvGnbNkqTxTceI42+Aawbm3wucVFWLgNuAo1v70cBtVfVk4KTWjyR7AUcATwEOBj6aZItpqFuSNIahBkeSBcBLgP/d5gO8EPh867IUOKxNH9rmacsPbP0PBc6oqnuq6gZgFbDvMOuWJI1v2COODwBvBR5o808Abq+q+9r8amB+m54P3ATQlt/R+v+2fYx1fivJMUlWJFmxbt26qX4ekqRmaMGR5E+BtVW1crB5jK61kWUTrfNgQ9UpVbW4qhbPmzevd72SpMnZcojbfh7wsiQvBrYBHks3Atk+yZZtVLEAWNP6rwZ2A1Yn2RJ4HLB+oH3E4DqSpGk2tBFHVb2tqhZU1UK6k9sXVNVrgAuBV7RuS4Cz2/SyNk9bfkFVVWs/ol11tQewCLhkWHVLkiY2zBHHeP4OOCPJu4HvAR9v7R8HPplkFd1I4wiAqroqyVnA1cB9wBur6v7pL1uSBNMUHFX1deDrbfp6xrgqqqp+DRw+zvonAicOr0JJ0mT5zXFJUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpl6EFR5JtklyS5PtJrkryzta+R5KLk1yX5MwkW7f2R7b5VW35woFtva21X5vkRcOqWZK0ccMccdwDvLCq9gGeDhycZH/gvcBJVbUIuA04uvU/Gritqp4MnNT6kWQv4AjgKcDBwEeTbDHEuiVJExhacFTnrja7VftXwAuBz7f2pcBhbfrQNk9bfmCStPYzquqeqroBWAXsO6y6JUkTG+o5jiRbJLkMWAssB34E3F5V97Uuq4H5bXo+cBNAW34H8ITB9jHWGdzXMUlWJFmxbt26YTwdSRJDDo6qur+qng4soBsl7DlWt/aYcZaN1z56X6dU1eKqWjxv3rxNLVmStBHTclVVVd0OfB3YH9g+yZZt0QJgTZteDewG0JY/Dlg/2D7GOpKkaTbMq6rmJdm+TT8K+BPgGuBC4BWt2xLg7Da9rM3Tll9QVdXaj2hXXe0BLAIuGVbdkqSJbbnxLptsF2BpuwLqEcBZVfWVJFcDZyR5N/A94OOt/8eBTyZZRTfSOAKgqq5KchZwNXAf8Maqun+IdUuSJjC04Kiqy4FnjNF+PWNcFVVVvwYOH2dbJwInTnWNkqT+/Oa4JKkXg0OS1IvBIUnqZVLBkeT8ybRJkh7+Jjw5nmQb4NHAjkl24MEv4z0W2HXItUmSZqGNXVX1euBv6UJiJQ8Gxy+AjwyxLknSLDVhcFTVB4EPJvmrqjp5mmqSJM1ik/oeR1WdnOS5wMLBdarq9CHVJUmapSYVHEk+CTwJuAwY+dZ2AQaHJM0xk/3m+GJgr3bvKEnSHDbZ73FcCTxxmIVIkjYPkx1x7AhcneQSup+EBaCqXjaUqiRJs9Zkg+P4YRYhSdp8TPaqqm8MuxBJ0uZhsldV3cmDP9e6NbAV8MuqeuywCpMkzU6THXFsNzif5DDG+E0NSdLD3ybdHbeq/g/wwimuRZK0GZjsoaqXD8w+gu57HX6nQ5LmoMleVfXSgen7gBuBQ6e8GknSrDfZcxxHDbsQSdLmYbI/5LQgyZeSrE1yS5IvJFkw7OIkSbPPZE+OfwJYRve7HPOBL7c2SdIcM9ngmFdVn6iq+9q/04B5Q6xLkjRLTTY4bk3y2iRbtH+vBX4+zMIkSbPTZIPjz4FXAj8DbgZeAXjCXJLmoMlejnsCsKSqbgNI8njgfXSBIkmaQyY74njaSGgAVNV64BnDKUmSNJtNNjgekWSHkZk24pjsaEWS9DAy2Q//fwK+k+TzdLcaeSVw4tCqkiTNWpP95vjpSVbQ3dgwwMur6uqhViZJmpUmfbipBYVhIUlz3CbdVl2SNHcZHJKkXgwOSVIvQwuOJLsluTDJNUmuSvI3rf3xSZYnua497tDak+RDSVYluTzJMwe2taT1vy7JkmHVLEnauGGOOO4D3lxVewL7A29MshdwLHB+VS0Czm/zAIcAi9q/Y4CPwW+/M3IcsB/d75wfN/idEknS9BpacFTVzVX1b236TuAauluyHwosbd2WAoe16UOB06tzEbB9kl2AFwHLq2p9+/b6cuDgYdUtSZrYtJzjSLKQ7hYlFwM7V9XN0IULsFPrNh+4aWC11a1tvPbR+zgmyYokK9atWzfVT0GS1Aw9OJJsC3wB+Nuq+sVEXcdoqwnaN2yoOqWqFlfV4nnz/KkQSRqWoQZHkq3oQuPTVfXF1nxLOwRFe1zb2lcDuw2svgBYM0G7JGkGDPOqqgAfB66pqvcPLFoGjFwZtQQ4e6D9yHZ11f7AHe1Q1rnAQUl2aCfFD2ptkqQZMMw73D4P+M/AFUkua21vB94DnJXkaOAnwOFt2TnAi4FVwK9oPxRVVeuTnABc2vq9q93WXZI0A4YWHFX1LcY+PwFw4Bj9C3jjONs6FTh16qqTJG0qvzkuSerF4JAk9WJwSJJ6MTgkSb0YHJKkXgwOSVIvBockqReDQ5LUi8EhSerF4JAk9WJwSJJ6MTgkSb0YHJKkXgwOSVIvBockqReDQ5LUi8EhSerF4JAk9WJwSJJ6MTgkSb0YHJKkXgwOSVIvBockqReDQ5LUi8EhSerF4JAk9WJwSJJ6MTgkSb0YHJKkXgwOSVIvBockqReDQ5LUi8EhSerF4JAk9TK04EhyapK1
Sa4caHt8kuVJrmuPO7T2JPlQklVJLk/yzIF1lrT+1yVZMqx6JUmTM8wRx2nAwaPajgXOr6pFwPltHuAQYFH7dwzwMeiCBjgO2A/YFzhuJGwkSTNjaMFRVd8E1o9qPhRY2qaXAocNtJ9enYuA7ZPsArwIWF5V66vqNmA5vxtGkqRpNN3nOHauqpsB2uNOrX0+cNNAv9Wtbbz235HkmCQrkqxYt27dlBcuSerMlpPjGaOtJmj/3caqU6pqcVUtnjdv3pQWJ0l60HQHxy3tEBTtcW1rXw3sNtBvAbBmgnZJ0gyZ7uBYBoxcGbUEOHug/ch2ddX+wB3tUNa5wEFJdmgnxQ9qbZKkGbLlsDac5LPAAcCOSVbTXR31HuCsJEcDPwEOb93PAV4MrAJ+BRwFUFXrk5wAXNr6vauqRp9wlyRNo6EFR1W9epxFB47Rt4A3jrOdU4FTp7A0SdJDMFtOjkuSNhMGhySpF4NDktSLwSFJ6sXgkCT1MrSrqiQNx0/etfdMl6BZaPd3XDFt+3LEIUnqxeCQJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9bDbBkeTgJNcmWZXk2JmuR5Lmqs0iOJJsAXwEOATYC3h1kr1mtipJmps2i+AA9gVWVdX1VfUb4Azg0BmuSZLmpC1nuoBJmg/cNDC/GthvsEOSY4Bj2uxdSa6dptrmgh2BW2e6iNkg71sy0yVoQ743RxyXqdjK702m0+YSHGO9IrXBTNUpwCnTU87ckmRFVS2e6Tqk0XxvzozN5VDVamC3gfkFwJoZqkWS5rTNJTguBRYl2SPJ1sARwLIZrkmS5qTN4lBVVd2X5E3AucAWwKlVddUMlzWXeAhQs5XvzRmQqtp4L0mSms3lUJUkaZYwOCRJvRgcc1iShUmunOk6JG1eDA5JUi8Gh7ZI8i9JrkpyXpJHJfnLJJcm+X6SLyR5NECS05J8LMmFSa5P8kdJTk1yTZLTZvh5aDOX5DFJvtred1cmeVWSG5O8N8kl7d+TW9+XJrk4yfeS/L8kO7f245Msbe/lG5O8PMn/THJFkq8l2Wpmn+XDg8GhRcBHquopwO3AfwK+WFXPrqp9gGuAowf67wC8EPhvwJeBk4CnAHsnefq0Vq6Hm4OBNVW1T1U9Ffhaa/9FVe0LfBj4QGv7FrB/VT2D7t51bx3YzpOAl9Ddz+5TwIVVtTdwd2vXQ2Rw6IaquqxNrwQWAk9N8q9JrgBeQxcMI75c3TXcVwC3VNUVVfUAcFVbV9pUVwB/0kYYL6iqO1r7Zwcen9OmFwDntvfoW9jwPfp/q+retr0teDCArsD36JQwOHTPwPT9dF8KPQ14U/sr7Z3ANmP0f2DUug+wmXyhVLNTVf0QeBbdB/w/JHnHyKLBbu3xZODD7T36esZ4j7Y/aO6tB7+s5nt0ihgcGst2wM3tePBrZroYzQ1JdgV+VVWfAt4HPLMtetXA43fb9OOAn7Zpb1k8zUxfjeXvgYuBH9P99bfdzJajOWJv4B+TPADcC/wX4PPAI5NcTPeH7qtb3+OBzyX5KXARsMf0lzt3ecsRSbNWkhuBxVXlb27MIh6qkiT14ohDktSLIw5JUi8GhySpF4NDktSLwSE9BEnu6tH3+CT/fVjbl6aLwSFJ6sXgkKbYeHdubfZJckGS65L85cA6b2l3JL48yTtnoGxp0gwOaepNdOfWp9HdofU5wDuS7JrkILq7FO8LPB14VpI/nOaapUnzliPS1FsAnJlkF2Br4IaBZWdX1d3A3UkupAuL5wMHAd9rfbalC5JvTl/J0uQZHNLUOxl4f1UtS3IA3X2VRoz+xm0BAf6hqv7X9JQnPTQeqpKm3kR3bj00yTZJngAcAFwKnAv8eZJtAZLMT7LTdBUr9eWIQ3poHp1k9cD8+5n4zq2XAF8FdgdOqKo1wJokewLfTQJwF/BaYO3wy5f6815VkqRePFQlSerF4JAk9WJwSJJ6MTgkSb0YHJKkXgwOSVIvBockqZf/Dz20qUc1AmO8AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.countplot(x='Label',data=df).set_title('Spam/ham distribution')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clearly, it can be seen, that we face sample imbalance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also, it would be reasonable to check, whether spam/ham labelling correlates with the length of messages." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LabelTextlength
0hamGo until jurong point, crazy.. Available only ...111
1hamOk lar... Joking wif u oni...29
2spamFree entry in 2 a wkly comp to win FA Cup fina...155
3hamU dun say so early hor... U c already then say...49
4hamNah I don't think he goes to usf, he lives aro...61
\n", + "
" + ], + "text/plain": [ + " Label Text length\n", + "0 ham Go until jurong point, crazy.. Available only ... 111\n", + "1 ham Ok lar... Joking wif u oni... 29\n", + "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 155\n", + "3 ham U dun say so early hor... U c already then say... 49\n", + "4 ham Nah I don't think he goes to usf, he lives aro... 61" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = df\n", + "df1['length'] = df1['Text'].apply(len)\n", + "df1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([,\n", + " ],\n", + " dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA5IAAAF8CAYAAAC9oITJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3X20bWddH/rvjxxBQSWEHBCS4EGJqFVBegxcva2UGEiIw1CGKFRLoNj0DqG1V++VoB0XtWoPvbfyMqzcRhIICoYX2xKbKOai1KEV5ISXCETIIQZyCCSHJkQsvhD43T/WPM3OyT45+9kva+219+czxhl7rWfOtddvz7HWeeZ3PnM+s7o7AAAAsFb3W3QBAAAALBdBEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAEAABgiSMIWqaqbqup7Fl0HAABsNkESAACAIYIkAAAAQwRJ2FqPr6rrqurOqnpTVX15VT2kqv5LVR2pqjumx6cffUFVvbOqfr6q/ltV/WVV/VZVPbSq3lBVf1FV76mqfYv7kwBg+VTVi6vqk1X1uar6SFWdXVU/U1Vvnfroz1XVe6vqcStec3FVfWxa9uGq+ocrlj2vqv6oql5eVZ+tqhur6jun9pur6raqunAxfy1sPUESttYPJDk3yaOTfFuS52X2vXttkq9N8qgkf5Xkl4953bOT/OMkpyX5+iR/PL3mlCTXJ3np1pcOADtDVT02yYuSfEd3f1WSpyW5aVp8QZK3ZNbHvjHJf66qL5uWfSzJ30vy4CQ/m+TXq+oRK371E5Ncl+Sh02uvSPIdSR6T5IeT/HJVfeXW/WWwOIIkbK1Xdfct3X17kt9K8vju/u/d/Zvd/fnu/lySX0jy3ce87rXd/bHuvjPJbyf5WHf/f919V2ad3bfP9a8AgOX2xSQPSPLNVfVl3X1Td39sWnZtd7+1u7+Q5JeSfHmSJyVJd79l6se/1N1vSnJDkrNW/N4/7+7XdvcXk7wpyRlJfq67/6a7fzfJ32YWKmHHESRha316xePPJ/nKqnpgVf2Hqvp4Vf1Fkj9IcnJVnbRi3VtXPP6rVZ47ugkAa9Tdh5L8yyQ/k+S2qrqiqh45Lb55xXpfSnI4ySOTpKqeW1Xvn05d/WySb0ly6opffWz/nO7WZ7MrCJIwfz+R5LFJntjdX53k70/ttbiSAGBn6+43dvf/mtmlJZ3kZdOiM46uU1X3S3J6kluq6muT/Gpmp8Q+tLtPTvLB6K8hiSAJi/BVmR2h/GxVnRLXOwLAlqqqx1bVU6rqAUn+OrN++IvT4r9bVc+sqj2ZjVr+TZJ3JXlQZoHzyPQ7np/ZiCQQQRIW4RVJviLJZzLrqH5nseUAwI73gCQHMut7P53kYUl+alr2tiQ/mOSOzCa6e2Z3f6G7P5zk32U24d2tSb41yR/NuW7Ytqq7F10DAADMXVX9TJLHdPcPL7oWWDZGJAEAABgiSAIAADDEqa0AAAAMMSIJAADAEEESAACAIXsWXcB9OfXUU3vfvn2LLgOATXbttdd+prv3LroO1k8fDbDzjPTP2zpI7tu3LwcPHlx0GQBssqr6+KJrYGP00QA7z0j/7NRWAAAAhgiSAAAADBEkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAEAABgiSAIAADBEkAQAAGDInkUXsEj7Lr7quMtuOnD+HCsBAIDtw34yJ2JEEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAFgh6mqy6rqtqr64CrL/o+q6qo6dXpeVfWqqjpUVddV1RPmXzEAy0aQBICd53VJzj22sarOSHJOkk+saD4vyZnTv4uSvHoO9QGw5E4YJDfrqGZVXVhVN0z/LtzcPwMAOKq7/yDJ7assenmSn0zSK9ouSPL6nnlXkpOr6hFzKBOAJbaWEcnXZYNHNavqlCQvTfLEJGcleWlVPWQjhQMAa1dV35fkk939gWMWnZbk5hXPD09tAHBcJwySm3RU82lJrunu27v7jiTXZJVwCgBsvqp6YJKfTvJ/rbZ4lbZepS1VdVFVHayqg0eOHNnMEgFYMuu6RnIdRzUd7QSAxfn6JI9O8oGquinJ6UneW1Vfk1mffMaKdU9Pcstqv6S7L+nu/d29f+/evVtcMgDb2Z7RF6w4qvnU1Rav0tb30b7a778os9Ni86hHPWq0PADgGN39p0kedvT5FCb3d/dnqurKJC+qqisyuwTlzu7+1GIqBWBZrGdEcj1HNR3tBIA5qarfSPLHSR5bVYer6gX3sfrVSW5McijJryb50TmUCMCSGx6RXM9Rzap6e5JfXDHBzlOTvGTD1QMA99LdzznB8n0rHneSF251TQDsLGu5/ceGj2p29+1J/nWS90z/fm5qAwAAYMmccERys45qdvdlSS4brA8AAIBtZl2ztgIAALB7CZIAAAAMESQBAAAYIkgCAAAwRJAEAABgiCAJAADAEEESAACAIYIkAAAAQwRJAAAAhgiSAAAADBEkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABDBEkAAACGCJIAAAAMESQBAAAYIkgCAAAwRJAEAABgiCAJAADAEEESAAC
AIYIkAAAAQwRJANhhquqyqrqtqj64ou3/rqo/q6rrquo/VdXJK5a9pKoOVdVHquppi6kagGUiSALAzvO6JOce03ZNkm/p7m9L8tEkL0mSqvrmJM9O8nem1/xKVZ00v1IBWEYnDJKbdVSzqs6d2g5V1cWb/6cAAEnS3X+Q5PZj2n63u++anr4ryenT4wuSXNHdf9Pdf57kUJKz5lYsAEtpLSOSr8sGj2pORzb/fZLzknxzkudM6wIA8/dPkvz29Pi0JDevWHZ4agOA4zphkNyko5pnJTnU3Td2998muWJaFwCYo6r66SR3JXnD0aZVVuvjvPaiqjpYVQePHDmyVSUCsAQ24xrJtRzVXPPRTp0UAGyNqrowyfcm+aHuPhoWDyc5Y8Vqpye5ZbXXd/cl3b2/u/fv3bt3a4sFYFvbUJAcOKq55qOdOikA2HxVdW6SFyf5vu7+/IpFVyZ5dlU9oKoeneTMJH+yiBoBWB571vvCFUc1z17jUc01He0EADamqn4jyZOTnFpVh5O8NLP5DB6Q5JqqSpJ3dff/1t0fqqo3J/lwZgeHX9jdX1xM5QAsi3UFyRVHNb97laOab6yqX0ryyNx9VLOSnDkd6fxkZhPy/KONFA4ArK67n7NK86X3sf4vJPmFrasIgJ3mhEFys45qVtWLkrw9yUlJLuvuD23B3wMAAMAWO2GQ3Kyjmt19dZKrh6oDAABg29mMWVsBAADYRQRJAAAAhgiSAAAADBEkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABDBEkAAACGCJIAAAAMESQBAAAYIkgCAAAwZM+iCwAAAOZv38VXLboElpgRSQAAAIYIkgAAAAwRJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABDBEkAAACGCJIAAAAMESQBYIepqsuq6raq+uCKtlOq6pqqumH6+ZCpvarqVVV1qKquq6onLK5yAJbFCYPkZnVGVXXhtP4NVXXh1vw5AECS1yU595i2i5O8o7vPTPKO6XmSnJfkzOnfRUlePacaAVhiaxmRfF022BlV1SlJXprkiUnOSvLSo+ETANhc3f0HSW4/pvmCJJdPjy9P8owV7a/vmXclObmqHjGfSgFYVicMkpvUGT0tyTXdfXt335Hkmtw7nAIAW+fh3f2pJJl+PmxqPy3JzSvWOzy1AcBxrfcaydHOSCcFANtTrdLWq65YdVFVHayqg0eOHNnisgDYzjZ7sp3jdUY6KQBYrFuPnrI6/bxtaj+c5IwV652e5JbVfkF3X9Ld+7t7/969e7e0WAC2t/UGydHOSCcFAIt1ZZKjk91dmORtK9qfO02Y96Qkdx496wgAjme9QXK0M3p7kqdW1UOmSXaeOrUBAJusqn4jyR8neWxVHa6qFyQ5kOScqrohyTnT8yS5OsmNSQ4l+dUkP7qAkgFYMntOtMLUGT05yalVdTiz2VcPJHnz1DF9IsmzptWvTvL0zDqjzyd5fpJ09+1V9a+TvGda7+e6+9gJfLaVfRdfddxlNx04f46VAMCY7n7OcRadvcq6neSFW1sRADvNCYPkZnVG3X1ZksuGqgMAAGDb2ezJdgAAANjhBEkAAACGCJIAAAAMESQBAAAYIkgCAAAwRJAEAABgiCAJAADAEEESAACAIYIkAAAAQwRJAAAAhgiSAAAADBEkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABDBEkAAACGCJIAAAAMESQBAAAYIkgCAAAwRJAEAABgiCAJAADAEEESAACAIYIkAAAAQwRJANhFqup/r6oPVdUHq+o3qurLq+rRVfXuqrqhqt5UVfdfdJ0AbG8bCpIjnVFVPWB6fmhavm8z/gAAYG2q6rQk/yLJ/u7+liQnJXl2kpcleXl3n5nkjiQvWFyVACyDdQfJdXRGL0hyR3c/JsnLp/UAgPnak+QrqmpPkgcm+VSSpyR567T88iTPWFBtACyJjZ7aOtIZXTA9z7T87KqqDb4/ALBG3f3JJP9Pkk9k1mffmeTaJJ/t7rum1Q4nOW2111fVRVV1sKoOHjlyZB4lA7BNrTtIrqMzOi3JzdNr75rWf+h63x8AGFNVD8nswO6jkzwyyYOSnLfKqr3a67v7ku7e39379+7du3WFArDtbeTU1tHOaLXRx3t1VI52AsCW+Z4kf97dR7r7C0n+Y5LvTHLydHZRkpye5JZFFQjActjIqa2jndHhJGckybT8wUluP/aXOtoJAFvmE0meVFUPnC4vOTvJh5P8fpLvn9a5MMnbFlQfAEtiI0FytDO6cnqeafnvdfeqp84AAJuvu9+d2TwF703yp5ntB1yS5MVJfryqDmV22cmlCysSgKWw58SrrK67311VRzuju5K8L7PO6KokV1TVz09tRzujS5P82tRJ3Z7ZDK8AwBx190uTvPSY5huTnLWAcgBYUusOkslYZ9Tdf53kWRt5PwAAABZvo7f/AAAAYJcRJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABDBEkAAACGCJIAAAAMESQBAAAYIkgCAAAwRJAEAABgiCAJAADAkD2LLgAAAFge+y6+6rjLbjpw/hwrYZGMSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABDBEkAAACGCJIAAAAMESQBAAAYIkgCAAAwRJAEAABgiCAJAADAEEESAACAIYIkAAAAQwRJAAAAhgiSAAAADBEkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgB2kao6uareWlV/VlXXV9X/UlWnVNU1VXXD9PMhi64TgO1tQ0FypDOqmVdV1aGquq6qnrA5fwIAMOCVSX6nu78xyeOSXJ/k4iTv6O4zk7xjeg4Ax7XREcmRzui8JGdO/y5K8uoNvjcAMKCqvjrJ309yaZJ0999292eTXJDk8mm1y5M8YzEVArAs1h0k19EZXZDk9T3zriQnV9Uj1l05ADDq65IcSfLaqnpfVb2mqh6U5OHd/akkmX4+bLUXV9VFVXWwqg4eOXJkflUDsO1sZERytDM6LcnNK15/eGq7B50UAGyZPUmekOTV3f3tSf5HBk5j7e5Lunt/d+/fu3fvVtUIwBLYs8HXPiHJP+/ud1fVK3PfnVGt0tb3aui+JMklSbJ///57LQcA1u1wksPd/e7p+Vsz67tvrapHdPenprOFbltYhcC67Lv4qlXbbzpw/pwrYbfYSJAc7YwOJzljxetPT3LLBt5/YY73RU18WQHYvrr701V1c1U9trs/kuTsJB+e/l2Y5MD0820LLBOAJbDuU1u7+9NJbq6qx05NRzujKzPrhJJ7dkZXJnnuNHvrk5LcefQUWABgbv55kjdU1XVJHp/kFzMLkOdU1Q1JzpmeA8BxbWREMrm7M7p/khuTPD
+zcPrmqnpBkk8keda07tVJnp7kUJLPT+sCAHPU3e9Psn+VRWfPuxYAlteGguRIZ9TdneSFG3k/AAAAFm+j95EEAABglxEkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABD9iy6AAAAYGvsu/iqRZfADmVEEgAAgCGCJAAAAEMESQAAAIa4RhIAAJaA6x3ZToxIAgAAMESQBAAAYIggCQAAwBBBEgAAgCGCJAAAAEPM2goAANuEmVlZFkYkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgAAgCFmbd1k9zXT1k0Hzp9jJQAAAFvDiCQA7DJVdVJVva+q/sv0/NFV9e6quqGq3lRV9190jQBsbxsOkmvtjKrqAdPzQ9PyfRt9bwBgXX4syfUrnr8sycu7+8wkdyR5wUKqAmBpbMaI5Fo7oxckuaO7H5Pk5dN6AMAcVdXpSc5P8prpeSV5SpK3TqtcnuQZi6kOgGWxoSA52BldMD3PtPzsaX0AYH5ekeQnk3xpev7QJJ/t7rum54eTnLaIwgBYHhsdkRzpjE5LcnOSTMvvnNYHAOagqr43yW3dfe3K5lVW7eO8/qKqOlhVB48cObIlNQKwHNYdJNfRGa2po9JJAcCW+a4k31dVNyW5IrOziF6R5OSqOjqT++lJblntxd19SXfv7+79e/funUe9AGxTGxmRHO2MDic5I0mm5Q9Ocvuxv1QnBQBbo7tf0t2nd/e+JM9O8nvd/UNJfj/J90+rXZjkbQsqEYAlse4guY7O6Mrpeablv9fdq546AwDM1YuT/HhVHcrsspNLF1wPANvcnhOvMuzFSa6oqp9P8r7c3RldmuTXpk7q9szCJwCwAN39ziTvnB7fmOSsRdYDwHLZlCC5ls6ou/86ybM24/0AAABYnM24jyQAAAC7iCAJAADAEEESAACAIYIkAAAAQwRJAAAAhgiSAAAADBEkAQAAGCJIAgAAMESQBAAAYIggCQAAwBBBEgAAgCGCJAAAAEMESQAAAIYIkgAAAAwRJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABD9iy6ANZv38VXHXfZTQfOn2MlAADAbmJEEgAAgCFGJAEAYI7u66wyWBZGJAEAABgiSAIAADDEqa1ztJ7TGEyaAwAAbDdGJAEAABgiSAIAADBEkAQAAGCIIAkAAMAQQRIAAIAhgiQAAABDBEkAAACGuI/kNreee08CwGqq6owkr0/yNUm+lOSS7n5lVZ2S5E1J9iW5KckPdPcdi6oTgO1v3SOSVXVGVf1+VV1fVR+qqh+b2k+pqmuq6obp50Om9qqqV1XVoaq6rqqesFl/BACwJncl+Ynu/qYkT0rywqr65iQXJ3lHd5+Z5B3TcwA4ro2MSB7tjN5bVV+V5NqquibJ8zLrjA5U1cWZdUYvTnJekjOnf09M8urpJwAwB939qSSfmh5/rqquT3JakguSPHla7fIk78ys7wZy/DPEbjpw/pwrge1j3SOS3f2p7n7v9PhzSVZ2RpdPq12e5BnT4wuSvL5n3pXk5Kp6xLorBwDWrar2Jfn2JO9O8vApZB4Nmw9bXGUALINNmWxnjZ3RaUluXvGyw1Pbsb/roqo6WFUHjxw5shnlAQArVNVXJvnNJP+yu/9i4HX6aACSbEKQHOiMapW2vldD9yXdvb+79+/du3ej5QEAK1TVl2XWb7+hu//j1Hzr0bOEpp+3rfZafTQAR20oSA52RoeTnLHi5acnuWUj7w8ArF1VVZJLk1zf3b+0YtGVSS6cHl+Y5G3zrg2A5bKRWVtHO6Mrkzx3mr31SUnuPHoKLAAwF9+V5B8neUpVvX/69/QkB5KcU1U3JDlneg4Ax7WRWVuPdkZ/WlXvn9p+KrPO581V9YIkn0jyrGnZ1UmenuRQks8nef4G3hsAGNTdf5jVLzVJkrPnWQsAy23dQXK0M+ruTvLC9b4fAAAA28OmzNoKAADA7iFIAgAAMGQj10gujX0XX7XoEgAAAHYMI5IAAAAMESQBAAAYsitObQUAYHe4r0uabjpw/tK+F2w3RiQBAAAYIkgCAAAwRJAEAABgiGskAQDYdpb9+kO3n2OnMyIJAADAEEESAACAIYIkAAAAQ1wjCQDArueaRhhjRBIAAIAhgiQAAABDBEkAAACGCJIAAAAMMdkOAMAuc18Ty9x04Pw5VjJfJtTZerv1s7UbGZEEAABgiBFJAIBNdrxRmXmPyGz2CJzRJuAoI5IAAAAMESQBAAAYIkgCAAAwxDWSAAAszHqu4zT7KiyeILlDuRgeAADYKoIkAABbyggi7DyukQQAAGCIEUkAYMfbyZd8GO1jWazns7rs38+dzIgkAAAAQ4xIAgAcx04eydxsRkaZN9/PxRIkdyFfOgAAYCOc2goAAMCQuY9IVtW5SV6Z5KQkr+nuA/OugXFGMQF2Nv3zuPWcyrkM/alTVIG1mGuQrKqTkvz7JOckOZzkPVV1ZXd/eJ51cHzz7BSXoTMF2A30zwCMmveI5FlJDnX3jUlSVVckuSCJjop7MD00wFwtpH/e7JGv9fYD22UEbrvUAdvJvL8Xx3u/7bSfuV1qnHeQPC3JzSueH07yxDnXwBzN88u/Fe+12SOq661xO/3ndTxGmGGp6Z8BGDLvIFmrtPU9Vqi6KMlF09O/rKqPbPA9T03ymQ3+jt3CtjpGvey4i+5zW93H6za7jmVwar3M52rAbvkefu2iC+AeTtg/J1vSR2+qJfi/crd8v7eSbbg5dvx2nMO+2Lbbhpv0N6+5f553kDyc5IwVz09PcsvKFbr7kiSXbNYbVtXB7t6/Wb9vJ7Ot1s62WjvbaoztxYKcsH9ONr+P3m18vzfONtwctuPG2Ybzv/3He5KcWVWPrqr7J3l2kivnXAMAcE/6ZwCGzHVEsrvvqqoXJXl7ZtOLX9bdH5pnDQDAPemfARg19/tIdvfVSa6e41s6BWftbKu1s63WzrYaY3uxEAvon3cj3++Nsw03h+24cbt+G1b3va6lBwAAgOOa9zWSAAAALDlBEgAAgCFzv0ZyK1XVNya5ILMbK3dmU5df2d3XL7QwAACAHWTHXCNZVS9O8pwkV2R2P6xkdh+sZye5orsPLKq27ayqHp4Vwbu7b11wSdtaVZ2SpLv7jkXXsp35XI3xuQKA7c/+zT3tpCD50SR/p7u/cEz7/ZN8qLvPXExl21NVPT7J/5vkwUk+OTWfnuSzSX60u9+7qNq2m6p6VJJ/m+TszLZPJfnqJL+X5OLuvmlx1W0vPldr53MFO1tVPTjJS5I8I8neqfm2JG9LcqC7P7uo2paNnfeNq6pKclbuedben/ROCQJbzP7N6nbSqa1fSvLIJB8/pv0R0zLu6XVJ/ll3v3tlY1U9KclrkzxuEUVtU29K8ookP9TdX0ySqjopybMyGwF/0gJr225eF5+rtfK5gp3tzZkdGHpyd386Sarqa5JcmOQtSc5ZYG1L4Xg77
1W1q3feR1XVU5P8SpIbcs8Q9Jiq+tHu/t2FFbc8Xhf7N/eyk0Ykz03yy5l9SW6emh+V5DFJXtTdv7Oo2rajqrrheKO0VXWoux8z75q2qxNsq+Mu2418rtbO5wp2tqr6SHc/dnQZd6uq9+f4O+//obt35c77qKq6Psl5x57pUlWPTnJ1d3/TQgpbIvZvVrdjRiS7+3eq6hty97B9ZXat5HuOHu3nHn67qq5K8vrcHbzPSPLcJEL3PV1bVb+S5PLcc1tdmOR9C6tqe/K5WjufK9jZPl5VP5nk8qOnYk6naD4vd3/nuW8POjZEJkl3v6uqHrSIgpbUntw9f8hKn0zyZXOuZVnZv1nFjhmRZFxVnZe7Z7k9Gryv7O6rF1rYNjNdZ/uCrLKtklza3X+zwPK2HZ+rtfG5gp2tqh6S5OLMvuMPz+y6tFsz+46/rLtvX2B5S6GqXpXk67P6zvufd/eLFlXbMqmqlyT5gcwum1i5HZ+d5M3d/W8WVdsysX9zb4IkAMAWq6q/l9lZU3/qmrS1s/O+Oarqm7L6dvzwQgtjqQmSu9SK2eQuSPKwqdlscquoqj2ZjRw9I/ec7extmY0cfeE+Xr6r+Fytnc8V7GxV9Sfdfdb0+EeSvDDJf07y1CS/5bZksDzs36zufosugIV5c5I7kvyD7n5odz80yT/IbBrjtyy0su3n15I8PsnPJnl6kvOnx49L8usLrGs78rlaO58r2NlWXnv2z5I8tbt/NrMg+UOLKWm5VNWDq+pAVV1fVf99+nf91HbyoutbFtOElEcfP7iqXlNV11XVG6frdjkx+zerMCK5S5lNbu1OsK0+2t3fMO+atiufq7XzuYKdrao+kOTJmR20f3t371+x7H3d/e2Lqm1ZVNXbM7uFyuXH3ELleUnO7m63UFmDqnpvdz9hevyaJJ9O8qtJnpnku7v7GYusbxnYv1mdEcnd6+NV9ZMrj0RV1cOr6sUxm9yx7qiqZ1XV//y+VNX9quoHMzs6xd18rtbO5wp2tgcnuTbJwSSnTAEoVfWVmV2jxont6+6XHQ2RSdLdn55OC37UAutaZvu7+19198e7++VJ9i26oCVh/2YVguTu9YNJHprkv1bVHVV1e5J3Jjkls5m9uNuzk3x/klur6qNVdUNmR/OeOS3jbj5Xa3f0c/Xp6XP10fhcwY7R3fu6++u6+9HTz6Nh6EtJ/uEia1sidt43x8Oq6ser6ieSfHVVrTyQIQusjf2bVTi1dRerqm9McnqSd3X3X65oP7e7d+09ce5LVT00syPJr+juH150PdtNVT3dFp1nAAACn0lEQVQxyZ91951V9cDMpr5/QpIPJfnF7r5zoQVuI9PtP56T2QQ7701yXpLvzGxbXWKyHWC3O+YWKkcnODl6C5UD3e3sjTWoqpce0/Qr3X1kGiX/t9393EXUtWzsN9+bILlLVdW/yGwGueszm/Djx7r7bdOy/3kuPUlVXblK81Myu24j3f19861o+6qqDyV5XHffVVWXJPkfSX4zydlT+zMXWuA2UlVvyOwm0V+R5M4kD0rynzLbVtXdFy6wPIBtraqe392vXXQdy852XBv7zavbs+gCWJh/muTvdvdfVtW+JG+tqn3d/cq4duNYpyf5cJLXZHaLhkryHUn+3SKL2qbu1913TY/3r/iP9Q+r6v2LKmqb+tbu/rbpNiCfTPLI7v5iVf16kg8suDaA7e5nkwhAG2c7ro395lUIkrvXSUeH5bv7pqp6cmZfiq/NLv5CHMf+JD+W5KeT/J/d/f6q+qvu/q8Lrms7+uCKo5sfqKr93X2wqr4hiVM17+l+0+mtD0rywMwm5rg9yQNyz9sGAOxKVXXd8RYlcduKNbIdN4X95lUIkrvXp6vq8d39/iSZjrB8b5LLknzrYkvbXrr7S0leXlVvmX7eGt+d4/mRJK+sqn+V5DNJ/riqbs5sUoQfWWhl28+lSf4syUmZHaR4S1XdmORJSa5YZGEA28TDkzwt957JupL8t/mXs7Rsx42z37wK10juUlV1epK7Vk6pvWLZd3X3Hy2grKVQVecn+a7u/qlF17JdVdVXJfm6zAL34e6+dcElbUtV9cgk6e5bpptrf0+ST3T3nyy2MoDFq6pLk7y2u/9wlWVv7O5/tICylo7tuHH2m1cnSAIAADDEvWMAAAAYIkgCAAAwRJAEAABgiCAJAADAEEESAACAIf8/pMFfLPQ1CukAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df1.hist(column='length',by='Label',bins=50, figsize=(15,6))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It can be seen that most of spam messages are longer than 100 symbols, while hams tend to be shorter." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# replace 'ham' with 0 and 'spam' with 1 and divide evth into train and test split\n", + "df['Label'] = df['Label'].map({'ham': 0, 'spam': 1})\n", + "X_tr, X_t, y_tr, y_t = train_test_split(df['Text'],df['Label'],random_state=777)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quality Metric argumentation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Regarding the sample imbalance mentioned in the previous part, and our interest in identification of both classes, I would suggest using Precision, Recall, and F1-score, because accuracy will tend to show great results, when just predicting the majority class." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Text preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "def sms_preprocess(sms, lower_case = True, stem = True, stop_words = True, gram = 2):\n", + " if lower_case:\n", + " sms = sms.lower() # words are the same, don't mind if they are in capital letters\n", + " words = word_tokenize(sms)\n", + " words = [w for w in words if len(w) > 2]\n", + " if stop_words:\n", + " stopw = stopwords.words('english')\n", + " words = [w for w in words if word not in stopw]\n", + " if stem:\n", + " stemming = PorterStemmer()\n", + " words = [stemming.stem(w) for w in words] \n", + " return words\n", + "'''\n", + "vectorizer = CountVectorizer(stop_words='english')\n", + "X_tr_trans = vectorizer.fit_transform(X_tr).toarray()\n", + "X_t_trans = vectorizer.transform(X_t)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Naive Bayes Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Bag of words " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First of all, let us consider classifier using bag of words approach.\n", + "Calculate the probabilities of spam and ham in our dataset." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$P(Spam) = \\frac{number\\:of\\:spam\\:messages}{overall\\:number\\:of\\:messages} $$\n", + "\n", + "$$P(Ham) = \\frac{number\\:of\\:ham\\:messages}{overall\\:number\\:of\\:messages}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.13711414213926776" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Probability of spam\n", + "Pspam = sum(y_tr) / len(y_tr)\n", + "Pspam" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8628858578607322" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Probability of ham\n", + "Pham = 1 - Pspam\n", + "Pham" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we divide our train sample in ham and spam ones in order to calculate the conditional probabilities. We are also taking into account the issue of potential new words(not present in training sample) appearance during tests and thus it may turn our calculations into zero. Therefore, we're taking logarithms and introducing adaptive smoothing(Laplace smoothing in particular with $\\alpha = 1$) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$log(P(ham | bodyText)) = log(P(ham)) + \\sum\\limits_{i=1}^n log(P(word_i | ham))$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$P(Word | Spam) = \\frac{total\\:number\\:of\\:occurences\\:of\\:w\\:in\\:spam\\:messages + 1}{total\\:number\\:of\\:words\\:in\\:spam\\:messages}$$\n", + "\n", + "$$P(Word | Ham) = \\frac{total\\:number\\:of\\:occurences\\:of\\:w\\:in\\:ham\\:messages + 1}{total\\:number\\:of\\:words\\:in\\:ham\\:messages}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "# tables for calculations\n", + "spam_table = pd.DataFrame(X_tr_trans[np.where(y_tr == 1)],columns = vectorizer.get_feature_names())\n", + "ham_table = pd.DataFrame(X_tr_trans[np.where(y_tr == 0)],columns = vectorizer.get_feature_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "# probabilities of word occurence, if it is a spam message\n", + "spam_freq = spam_table.sum(axis=0) + 1\n", + "spam_ovr = spam_table.sum().sum()\n", + "spam_prob = spam_freq/spam_ovr" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "# probabilities of word occurence, if it is a ham message\n", + "ham_freq = ham_table.sum(axis=0) + 1\n", + "ham_ovr = ham_table.sum().sum()\n", + "ham_prob = ham_freq/ham_ovr" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": {}, + "outputs": [], + "source": [ + "def proba(data):\n", + " prob_ham = np.log(Pham)\n", + " prob_spam = np.log(Pspam)\n", + " data = scipy.sparse.find(data)\n", + " for i in range(len(data[1])): # iterations over all words\n", + " prob_ham = prob_ham + np.log(ham_prob[data[1][i]]) * data[2][i]\n", + " prob_spam = prob_spam + np.log(spam_prob[data[1][i]]) * data[2][i]\n", + " if prob_ham >= prob_spam:\n", + " return 0\n", + " else:\n", + " return 1\n", + "def naive_bayes(dfrm):\n", + " result = []\n", + " for i in dfrm:\n", + " result.append(proba(i))\n", + " return result" + ] + }, 
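+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5-fold validation (sketch)\n",
+    "\n",
+    "The task list above also asks for 5-fold validation. Below is a minimal sketch of how it could be done for the hand-coded classifier: the same word-count, add-one-smoothing model as above, but with the vocabulary and the word probabilities refit on every fold so that nothing from the held-out fold leaks into training. The helper names `fit_nb` and `predict_nb` exist only for this sketch, and the reported metric is the F1-score of the spam class, in line with the metric argumentation above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: 5-fold cross-validation of the hand-coded naive Bayes (illustrative helpers).\n",
+    "import numpy as np\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.metrics import f1_score\n",
+    "from sklearn.model_selection import KFold\n",
+    "\n",
+    "def fit_nb(texts, labels):\n",
+    "    # refit the vocabulary and the smoothed word probabilities on the training fold\n",
+    "    vec = CountVectorizer(stop_words='english')\n",
+    "    counts = vec.fit_transform(texts)  # sparse document-term matrix\n",
+    "    labels = np.asarray(labels)\n",
+    "    log_p_spam = np.log(labels.mean())      # prior P(spam)\n",
+    "    log_p_ham = np.log(1 - labels.mean())   # prior P(ham)\n",
+    "    spam_freq = np.asarray(counts[labels == 1].sum(axis=0)).ravel() + 1  # add-one smoothing\n",
+    "    ham_freq = np.asarray(counts[labels == 0].sum(axis=0)).ravel() + 1\n",
+    "    log_spam = np.log(spam_freq / spam_freq.sum())\n",
+    "    log_ham = np.log(ham_freq / ham_freq.sum())\n",
+    "    return vec, log_p_spam, log_p_ham, log_spam, log_ham\n",
+    "\n",
+    "def predict_nb(model, texts):\n",
+    "    # log P(class) + sum over words of count * log P(word | class), as in proba() above\n",
+    "    vec, log_p_spam, log_p_ham, log_spam, log_ham = model\n",
+    "    counts = vec.transform(texts)\n",
+    "    spam_score = log_p_spam + counts @ log_spam\n",
+    "    ham_score = log_p_ham + counts @ log_ham\n",
+    "    return (spam_score > ham_score).astype(int)\n",
+    "\n",
+    "kf = KFold(n_splits=5, shuffle=True, random_state=777)\n",
+    "scores = []\n",
+    "for train_idx, test_idx in kf.split(df['Text']):\n",
+    "    model = fit_nb(df['Text'].iloc[train_idx], df['Label'].iloc[train_idx])\n",
+    "    preds = predict_nb(model, df['Text'].iloc[test_idx])\n",
+    "    scores.append(f1_score(df['Label'].iloc[test_idx], preds))\n",
+    "print('5-fold F1 (spam class):', np.round(scores, 3), 'mean =', round(np.mean(scores), 3))"
+   ]
+  },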
+ { + "cell_type": "code", + "execution_count": 261, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.98 0.99 1219\n", + " 1 0.88 0.97 0.92 174\n", + "\n", + " micro avg 0.98 0.98 0.98 1393\n", + " macro avg 0.94 0.98 0.95 1393\n", + "weighted avg 0.98 0.98 0.98 1393\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_t,naive_bayes(X_t_trans)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Naive Bayes using sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# training\n", + "NB = MultinomialNB()\n", + "NB.fit(X_tr_trans, y_tr)" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.99 0.99 0.99 1219\n", + " 1 0.95 0.95 0.95 174\n", + "\n", + " micro avg 0.99 0.99 0.99 1393\n", + " macro avg 0.97 0.97 0.97 1393\n", + "weighted avg 0.99 0.99 0.99 1393\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_t, NB.predict(X_t_trans)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It can be seen that our results are a little bit worse for spam class using F1-Score metric, but still these differences can be treated as insignificant." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "oldHeight": 223.333666, + "position": { + "height": "244.667px", + "left": "980.333px", + "right": "20px", + "top": "120px", + "width": "279.667px" + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "varInspector_section_display": "block", + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/NLP/nlp_hw_2/NLP_Task2.ipynb b/NLP/nlp_hw_2/NLP_Task2.ipynb new file mode 100644 index 0000000..0f20a4b --- /dev/null +++ b/NLP/nlp_hw_2/NLP_Task2.ipynb @@ -0,0 +1,967 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Homework №2: Text classification (task 2)\n", + "*Author: Solonin Maxim*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The task: \n", + "Classify news to one of 5 categories based on text\n", + "\n", + "1. Choose and argue your measure of a test's accuracy.\n", + "2. Build data processing and classification pipeline.\n", + "3. Tune your model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression, SGDClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.feature_selection import SelectKBest, chi2\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.pipeline import Pipeline\n", + "from pprint import pprint\n", + "from time import time\n", + "from sklearn.neural_network import MLPClassifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Uploading the data from folders" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By the way the data link of University of Dublin is broken" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "data_folder = \"C:/Users/Maxim/Desktop/BBC-Dataset-News-Classification-master/dataset/data_files\"\n", + "folders = [\"business\",\"entertainment\",\"politics\",\"sport\",\"tech\"]\n", + "\n", + "os.chdir(data_folder)\n", + "\n", + "x = []\n", + "y = []\n", + "\n", + "for i in folders:\n", + " files = os.listdir(i)\n", + " for text_file in files:\n", + " file_path = i + \"/\" +text_file\n", + " with open(file_path) as f:\n", + " data = f.readlines()\n", + " data = ' '.join(data)\n", + " x.append(data)\n", + " y.append(i)\n", + " \n", + "data = {'news': x, 'type': y} \n", + "df = pd.DataFrame(data);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exploratory data analysis " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
newstype
0Ad sales boost Time Warner profit\\n \\n Quarter...business
1Dollar gains on Greenspan speech\\n \\n The doll...business
2Yukos unit buyer faces loan claim\\n \\n The own...business
3High fuel prices hit BA's profits\\n \\n British...business
4Pernod takeover talk lifts Domecq\\n \\n Shares ...business
\n", + "
" + ], + "text/plain": [ + " news type\n", + "0 Ad sales boost Time Warner profit\\n \\n Quarter... business\n", + "1 Dollar gains on Greenspan speech\\n \\n The doll... business\n", + "2 Yukos unit buyer faces loan claim\\n \\n The own... business\n", + "3 High fuel prices hit BA's profits\\n \\n British... business\n", + "4 Pernod takeover talk lifts Domecq\\n \\n Shares ... business" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head() # just understand how the data looks like" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The most basic thing about the news texts we can get is their distribution. It is a check for sample imbalance, so that we do not face any problems in future analysis." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlkAAAE6CAYAAAAlcEcuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAGQ1JREFUeJzt3XuwZWV95vHvw00jd7SxqG60wbQoUwpij6JMzUzAGPAS0IjCeGGU2ImSRMtMFDMzpSYmUaoMjo5jZETSGm9EJRC8khZ0tILSyE1tLTpEoQcGGoFuRA2Cv/ljryOH0yecffrst9fevb+fql17rXetc/bv9K6z+znv+653paqQJEnSaO3SdwGSJEk7I0OWJElSA4YsSZKkBgxZkiRJDRiyJEmSGjBkSZIkNWDIkiRJasCQJUmS1IAhS5IkqYHd+i4A4FGPelStXLmy7zIkSZIWdOWVV95eVcsWOm8sQtbKlStZv35932VIkiQtKMkPhznP4UJJkqQGDFmSJEkNGLIkSZIaMGRJkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBsZixfcdbeWZn+27hKZ+8I7n9l1CW2/dt+8K2nnrlr4rkCSNyFA9WUl+kOS6JFcnWd+1HZDkkiTXd8/7d+1J8p4kG5Ncm+Solj+AJEnSOFrMcOGvVdWRVbW62z8TWFdVq4B13T7ACcCq7rEGeP+oipUkSZoUS5mTdSKwttteC5w0q/3DNXA5sF+Sg5bwOpIkSRNn2JBVwJeSXJlkTdf26Kq6BaB7PrBrXw7cNOtrN3VtD5JkTZL1SdZv3rx5+6qXJEkaU8NOfD+mqm5OciBwSZLvPcS5maettmmoOgc4B2D16tXbHJckSZpkQ/VkVdXN3fNtwAXA04BbZ4YBu+fbutM3AQfP+vIVwM2jKliSJGkSLBiykuyZZO+ZbeDZwLeBi4DTutNOAy7sti8CXtFdZXg0sGVmWFGSJGlaDDNc+GjggiQz53+sqr6Q5Arg/CSnAzcCJ3fnfw54DrAR+AnwypFXLUmSNOYWDFlVdQNwxDztPwKOm6e9gDNGUp0kSdKE8rY6kiRJDRiyJEmSGjBkSZIkNWDIkiRJasCQJUmS1IAhS5IkqQFDliRJUgOGLEmSpAYMWZIkSQ0YsiRJkhowZEmSJDVgyJIkSWrAkCVJktSAIUuSJKkBQ5YkSVIDhixJkqQGDFmSJEkNGLIkSZIaMGRJkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDRiyJEmSGjBkSZIkNWDIkiRJamC3vguQND2etPZJfZfQ1HWnXdd3CZLGiD1ZkiRJDRiyJEmSGhg6ZCXZNclVSS7u9g9J8o0k1yf5ZJI9uvaHdfsbu+Mr25QuSZI0vhbTk/U6YMOs/XcCZ1fVKuBO4PSu/XTgzqr6VeDs7jxJkqSpMlTISrICeC7wwW4/wLHAp7pT1gInddsndvt0x4/rzpckSZoaw15d+G7gjcDe3f4jgbuq6r5ufxOwvNteDtwEUFX3JdnSnX/7SCqWJPViwxOe2HcJTT3xexsWPklahAV7spI8D7itqq6c3TzPqTXEsdnfd02S9UnWb968eahiJUmSJsUww4XHAL+Z5AfAJxgME74b2C/JTE/YCuDmbnsTcDBAd3xf4I6537Sqzqmq1VW1etmyZUv6ISRJksbNgiGrqt5cVSuqaiVwCvDlqnopcCnwou6004ALu+2Lun2641+uqm16siRJknZmS1kn603AG5JsZDDn6tyu/VzgkV37G4Azl1aiJEnS5FnUbXWq6jLgsm77BuBp85zzM+DkEdQmSZI0sbx3oSRJO7n3/e6X+y6hqTP+6ti+S5iXt9WRJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDRiyJEmSGjBkSZIkNWDIkiRJasCQJUmS1IAhS5IkqQFDliRJUgOGLEmSpAYMWZIkSQ0YsiRJkhowZEmSJDVgyJIkSWrAkCVJktSAIUuSJKkBQ5YkSVIDhixJkqQGDFmSJEkNGLIkSZIaMGRJkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDRiyJEmSGlgwZCV5eJJvJrkmyXeSvK1rPyTJN5Jcn+STSfbo2h/W7W/sjq9s+yNIkiSNn2F6sv4FOLaqjgCOBI5PcjTwTuDsqloF3Amc3p1/OnBnVf0qcHZ3niRJ0lRZMGTVwI+73d27RwHHAp/q2tcCJ3XbJ3b7dMePS5KRVSxJkjQBhpqTlWTXJFcDtwGXAP8E3FVV93WnbAKWd9vLgZsAuuNbgEfO8z3XJFmfZP3mzZuX9lNIkiSNmaFCVlXdX1VHAiuApwFPnO+07nm+XqvapqHqnKpaXVWrly1bNmy9kiRJE2FRVxdW1V3AZcDRwH5JdusOrQBu7rY3AQcDdMf3Be4YRbGSJEmTYpi
rC5cl2a/b/hXgWcAG4FLgRd1ppwEXdtsXdft0x79cVdv0ZEmSJO3Mdlv4FA4C1ibZlUEoO7+qLk7yXeATSd4OXAWc251/LvCRJBsZ9GCd0qBuSZKksbZgyKqqa4GnzNN+A4P5WXPbfwacPJLqJEmSJpQrvkuSJDVgyJIkSWrAkCVJktSAIUuSJKkBQ5YkSVIDhixJkqQGDFmSJEkNGLIkSZIaMGRJkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDRiyJEmSGjBkSZIkNWDIkiRJasCQJUmS1IAhS5IkqQFDliRJUgOGLEmSpAYMWZIkSQ0YsiRJkhowZEmSJDVgyJIkSWrAkCVJktSAIUuSJKkBQ5YkSVIDhixJkqQGDFmSJEkNLBiykhyc5NIkG5J8J8nruvYDklyS5Pruef+uPUnek2RjkmuTHNX6h5AkSRo3w/Rk3Qf8YVU9ETgaOCPJ4cCZwLqqWgWs6/YBTgBWdY81wPtHXrUkSdKYWzBkVdUtVfWtbvtuYAOwHDgRWNudthY4qds+EfhwDVwO7JfkoJFXLkmSNMYWNScryUrgKcA3gEdX1S0wCGLAgd1py4GbZn3Zpq5NkiRpagwdspLsBXwaeH1VbX2oU+dpq3m+35ok65Os37x587BlSJIkTYShQlaS3RkErI9W1We65ltnhgG759u69k3AwbO+fAVw89zvWVXnVNXqqlq9bNmy7a1fkiRpLA1zdWGAc4ENVfWXsw5dBJzWbZ8GXDir/RXdVYZHA1tmhhUlSZKmxW5DnHMM8HLguiRXd21/DLwDOD/J6cCNwMndsc8BzwE2Aj8BXjnSiiVJkibAgiGrqr7G/POsAI6b5/wCzlhiXZIkSRPNFd8lSZIaMGRJkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDRiyJEmSGjBkSZIkNWDIkiRJasCQJUmS1IAhS5IkqQFDliRJUgOGLEmSpAYMWZIkSQ0YsiRJkhowZEmSJDVgyJIkSWrAkCVJktSAIUuSJKkBQ5YkSVIDhixJkqQGDFmSJEkNGLIkSZIaMGRJkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDSwYspJ8KMltSb49q+2AJJckub573r9rT5L3JNmY5NokR7UsXpIkaVwN05P118Dxc9rOBNZV1SpgXbcPcAKwqnusAd4/mjIlSZImy4Ihq6q+Ctwxp/lEYG23vRY4aVb7h2vgcmC/JAeNqlhJkqRJsb1zsh5dVbcAdM8Hdu3LgZtmnbepa9tGkjVJ1idZv3nz5u0sQ5IkaTyNeuJ75mmr+U6sqnOqanVVrV62bNmIy5AkSerX9oasW2eGAbvn27r2TcDBs85bAdy8/eVJkiRNpu0NWRcBp3XbpwEXzmp/RXeV4dHAlplhRUmSpGmy20InJPk48B+BRyXZBLwFeAdwfpLTgRuBk7vTPwc8B9gI/AR4ZYOaJUmSxt6CIauqTv1XDh03z7kFnLHUoiRJkiadK75LkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDRiyJEmSGjBkSZIkNWDIkiRJasCQJUmS1IAhS5IkqQFDliRJUgOGLEmSpAYMWZIkSQ0YsiRJkhowZEmSJDVgyJIkSWrAkCVJktSAIUuSJKkBQ5YkSVIDhixJkqQGDFmSJEkNGLIkSZIaMGRJkiQ1YMiSJElqwJAlSZLUgCFLkiSpAUOWJElSA4YsSZKkBgxZkiRJDTQJWUmOT/L9JBuTnNniNSRJksbZyENWkl2B9wEnAIcDpyY5fNSvI0mSNM5a9GQ9DdhYVTdU1b3AJ4ATG7yOJEnS2GoRspYDN83a39S1SZIkTY3dGnzPzNNW25yUrAHWdLs/TvL9BrWMi0cBt++oF8s7d9QrTYUd+t7xtvl+fbQEO/Z37z/7/o3Yjv39i+/fCO3Q9+73PrCjXumXHjvMSS1C1ibg4Fn7K4Cb555UVecA5zR4/bGTZH1Vre67Di2e791k8/2bbL5/k8v3bqDFcOEVwKokhyTZAzgFuKjB60iSJI2tkfdkVdV9SX4P+CKwK/ChqvrOqF9HkiRpnLUYLqSqPgd8rsX3nlBTMSy6k/K9m2y+f5PN929y+d4BqdpmTrokSZKWyNvqSJIkNWDIkiRJasCQJUmS1IAhq4EkxwzTpvGXZP8kT+67Dkkad8m2S2HP1zZNDFltvHfINo2hJJcl2SfJAcA1wHlJ/rLvujScJGd179/uSdYluT3Jy/quSwtL8sIk1yfZkmRrkruTbO27Lg3t1+dpO2GHVzFGmizhMK2SPAN4JrAsyRtmHdqHwZphmgz7VtXWJL8NnFdVb0lybd9FaWjPrqo3JnkBgztQnAxcCvxNv2VpCGcBz6+qDX0XouEleQ3wWuDQOZ+VewNf76eq8WDIGq09gL0Y/LvuPat9K/CiXirS9tgtyUHAi4H/2ncxWrTdu+fnAB+vqjviPekmxa0GrIn0MeDzwF8AZ85qv7uq7uinpPFgyBqhqvpKkq8BT6qqt/Vdj7bbnzC4Y8HXquqKJIcC1/dck4b390m+B/wUeG2SZcDPeq5JDyHJC7vN9Uk+Cfwd8C8zx6vqM70UpqFU1RZgC3Bqkl2BRzPIF3sl2auqbuy1wB65GGkDSb5cVcf2XYc0rZLsD2ytqvuT7AnsXVX/r++6NL8k5z3E4aqqV+2wYrTdulvqvRW4FfhF11xVNbUXDxmyGkjyLmAV8LfAPTPt/jU2GZKcBbydQU/IF4AjgNdXlXN6JkCSM4CPVtVd3f7+wKlV9b/6rUzauSXZCDy9qn7Udy3jwqsL2zgA+BFwLPD87vG8XivSYjy7qrYyeM82AY8H/qjfkrQIr54JWABVdSfw6h7r0ZCSrE2y36z9/ZN8qM+atCg3MRg2VMc5WQ1U1Sv7rkFL4sTpybZLklTXTd/NEdmj55o0nCfPDchJntJnQVqUG4DLknyWB8+pm9olcOzJaiDJiiQXJLktya1JPp1kRd91aWgzE6dXA+ucOD1xvgicn+S4JMcCH2cw7Kvxt0s3vAtAt1adnQGT40bgEgZ/1Ow96zG1nJPVQJJLGFzS+pGu6WXAS6tqvoXaNIacOD25kuwC/A5wHBDgS8AHq+r+XgvTgpK8Angz8CmgGCyj8mdV9ZGH/EKNlSR7VtU9C5+58zNkNZDk6qo6cqE2jackjwDeADymqtYkWQUcVlUX91yatNNLcjiD+awB1lXVd3suSUPqFuQ+F9irqh6T5Ajgd6rqtT2X1huHC9u4PcnLkuzaPV7GYCK8JsN5wL0MVu+HweT3t/dXjoaR5Pzu+bok18599F2fhnYAcE9VvRfYnOSQvgvS0N4N/Abd/3dVdQ3w73utqGeOdbfxKuB/Amd3+1/v2jQZHldVL0lyKkBV/TTOfJ8Er+uevZJ3QiV5C4O5kIcx+GNndwa3Qzqmz7o0vKq6ac7H5VQP0xuyGuhWt/3NvuvQdrs3ya8wmBNCkscx60
oZjaequqXbfG1VvWn2sSTvBN607VdpzLwAeArwLYCqujnJVE+cnjA3JXkmUEn2AP4AmOrbJDlc2ECSQ5P8fZLN3RWGF3a3ZtFkeAuDq9EOTvJRYB3wxn5L0iLMd4HJCTu8Cm2Pe7ulN2b+wNmz53q0OL8LnAEsZzDN4shuf2o58b2BJJcD72Nw6TjAKcDvV9XT+6tKi5HkkcDRDCbfXl5Vt/dckhaQ5DXAa4FDgX+adWhv4OtV9bJeCtPQkvwXBnfL+HUGNxt+FfCxbn6WNHEMWQ0k+cbcQJXk8qo6uq+atDhJlgOPZdaQelV9tb+KtJAk+wL7M/jP+cxZh+6uqjv6qUqL0Q3r/gPwbAZ/4HwReNbc4V+Np+4ihd8HVvLgz86pnT5jyGogyTuAu4BPMOj2fgnwMAa9W/iBP966D/qXAN/hwTc5ndoPikmQZJ+q2totYLkNf+/GX5JvVdVRc9quneYbDE+SJNcwWMLhOh747KSqvtJbUT0zZDWQ5J9n7c78A89cblFV5fysMZbk+wxu7+Fk9wmS5OKqel73+1c88DsH/t6NNYd6dw7zjeJMO0NWA0leDHyh+6v6vwNHAX9aVd/quTQNIcnngZOr6sd91yJNA4d6dw5J/hODOXVf4sH3Lpza//sMWQ3MdG8n+XfAnwPvAv7YhD8ZknwaOILBVYWzPyj+oLeitKAkRz3U8Wn+oJd2hCR/AbycQW/k7KkWx/ZXVb9cJ6uNmcXXngv8VVVdmOStPdajxbmoe2iyvOshjhWDW7VIaucFwKFVdW/fhYwLe7IaSHIx8H+BZwFPBX4KfLOqjui1MEmSGknySQbLFd3Wdy3jwpDVQHeD4eOB66rq+iQHAU+qqi/1XJoeQpLzq+rFSa7jgQsWYDCBurzCaTIk2R14DQ/cM+0y4ANV9fPeipKmQJLLgCcDV/DgqRZTe2W2IUvqJDmoqm5J8tj5jlfVD3d0TVq8JB9kcM+7tV3Ty4H7q+q3+6tK2vkl+Q/ztbuEg6Rf6m7l8dOq+kWSxwNPAD5vT8hkSHLN3KH5+dokqTXvXSht66vAw7tV39cBrwT+uteKtBj3dzf1Bgb3EuWBi1EkNZLkhUmuT7IlydYkdyfZ2nddffLqQmlbqaqfJDkdeG9VnZXkqr6L0tD+CLg0yQ3d/koGQVlSW2cBz6+qDX0XMi7syZK2lSTPAF4KfLZr8w+SyfF14AMM1un5Rbf9j71WJE2HWw1YD+Z/HNK2Xg+8Gbigqr7TDTdd2nNNGt6Hga3An3b7pwIfAU7urSJpOqzvlnH4Ox58deFn+iupX058l7RTceK71I8k583TXFX1qh1ezJiwJ0uaI8mlPHidLACm+dYQE+aqJEdX1eUASZ7OYAhRUkNV5dzHOezJkuZI8tRZuw8Hfgu4r6re2FNJWoQkG4DDgBu7pscAGxjMz3JRWWnEkryxu0Dovcz/B+rU3vfVnixpjqq6ck7T15NM7WJ6E+j4vguQpszMZPf1vVYxhuzJkuZIcsCs3V2A1cD/qKrDeipJkjSB7MmStnUlD3R53wf8ADi9t2okaQIkWQa8CTicwVQLYLrns7pOlrStw4H3AdcA3wY+j93gkrSQjzIYOjwEeBuDP1Cv6LOgvjlcKM2R5HwG6yx9tGs6Fdi/qlxnSZL+FUmurKqnJrl25gKTJF+pqnlvHD0NHC6UtnXYnDWVLk1yTW/VSNJk+Hn3fEuS5wI3Ayt6rKd3hixpW66zJEmL9/Yk+wJ/CLwX2IfBHTSmliFL6iS5jsGE992BVyS5sdt/LPDdPmuTpAlwZ1VtAbYAvwaQ5Jh+S+qXc7KkTpLHPtTxqvrhjqpFkiZNkm9V1VELtU0Te7KkjiFKkhYvyTOAZwLLkrxh1qF9gF37qWo8GLIkSdJS7AHsxSBT7D2rfSvwol4qGhMOF0qSpCVJsivwyaqa6lA1l4uRSpKkJamq+4EDFjxxyjhcKEmSRuGqJBcBfwvcM9NYVZ/pr6R+GbIkSdIoHAD8CJh9r8ICpjZkOSdLkiSpAedkSZKkJUvy+CTrkny7239ykv/Wd119MmRJkqRR+N/Am+nuYVhV1wKn9FpRzwxZkiRpFB5RVd+c03ZfL5WMCUOWJEkahduTPI7BZHeSvAi4pd+S+uXEd0mStGRJDgXOYXCLnTuBfwZeOs23LHMJB0mSNApVVc9KsiewS1XdneSQvovqk8OFkiRpFD4NUFX3VNXdXduneqynd/ZkSZKk7ZbkCcC/AfZN8sJZh/YBHt5PVePBkCVJkpbiMOB5wH7A82e13w28upeKxoQT3yVJ0pIleUZV/WPfdYwTQ5YkSVqyJMsY9FytZNZIWVW9qq+a+uZwoSRJGoULgf8D/ANwf8+1jAV7siRJ0pIlubqqjuy7jnHiEg6SJGkULk7ynL6LGCf2ZEmSpCVLcjfwCOBeBjeJDoMFSvfptbAeOSdLkiSNwr7AS4FDqupPkjwGOKjnmnplT5YkSVqyJO8HfgEcW1VPTLI/8KWq+rc9l9Ybe7IkSdIoPL2qjkpyFUBV3Zlkj76L6pMT3yVJ0ij8PMmuQMEv1836Rb8l9cuQJUmSRuE9wAXAgUn+DPga8Of9ltQv52RJkqSR6G4WfRyDKwvXVdWGnkvqlSFLkiSpAYcLJUmSGjBkSZIkNWDIkiRJasCQJUmS1IAhS5IkqYH/D0IZdhg/TexeAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,4))\n", + "df.type.value_counts().plot(kind='bar')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It turns out that the number of documents varies from around 400 to 500 for each class, thus we can say that the sample is pretty balanced." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df['class'] = df['type'].factorize()[0] # create a column with classes encoded in digits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I have spotted that new line symbols are presented as \"\\n\" in the docs and decided to filter them, because they do not carry a lot of information." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def scrub_words(text):\n", + " \"\"\"Basic cleaning of texts.\"\"\"\n", + " \n", + " # remove html markup\n", + " text=re.sub(\"\\n\",\"\",text)\n", + " text=text.strip()\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "clean_news = [scrub_words(w) for w in df.news]\n", + "clean_news = pd.DataFrame(clean_news)\n", + "df['news'] = clean_news" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using tfidf vectorizer is equivalent to count vectorizer with tfidf transformer, but a little bit simpler." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2225, 34797)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tf_idf = TfidfVectorizer(sublinear_tf=True, min_df=3, norm='l2', encoding='latin-1', ngram_range=(1, 3), stop_words='english')\n", + "features = tf_idf.fit_transform(df.news).toarray()\n", + "labels = df['class']\n", + "features.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Data split" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df['news'], df['class'], test_size=0.15, random_state = 72)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I could have done all the models in one pipeline and set grid parameters for multiple classifiers, but I think it would be better to run them manually one by one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# instead of doing these steps one at a time, we can use a pipeline to complete then all at once\n", + "pipeline1 = Pipeline([('vect', tf_idf),\n", + " ('chi', SelectKBest(chi2, k=5000)),\n", + " ('clf', RandomForestClassifier(n_estimators=100))])\n", + "\n", + "pipeline2 = Pipeline([('vect', tf_idf),\n", + " ('chi', SelectKBest(chi2, k=5000)),\n", + " ('clf', LogisticRegression(penalty='l2',multi_class='multinomial',solver='lbfgs'))])\n", + "\n", + "pipeline3 = Pipeline([('vect', tf_idf),\n", + " ('chi', SelectKBest(chi2, k=5000)),\n", + " ('clf', LinearSVC())])\n", + "pipeline4 = Pipeline([('vect', tf_idf),\n", + " ('chi', SelectKBest(chi2, k=5000)),\n", + " ('clf', MLPClassifier())])\n", + "\n", + "ytest = np.array(y_test)\n", + "\n", + "# confusion matrix and classification report(precision, recall, F1-score)\n", + "#print(classification_report(ytest, model.predict(X_test)))\n", + "#print(confusion_matrix(ytest, model.predict(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model results" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.99 0.96 77\n", + " 1 0.98 0.95 0.96 56\n", + " 2 1.00 0.94 0.97 62\n", + " 3 0.99 1.00 0.99 85\n", + " 4 0.96 0.96 0.96 54\n", + "\n", + " micro avg 0.97 0.97 0.97 334\n", + " macro avg 0.97 0.97 0.97 334\n", + "weighted avg 0.97 0.97 0.97 334\n", + "\n" + ] + } + ], + "source": [ + "model_RF = pipeline1.fit(X_train,y_train)\n", + "print(classification_report(ytest, model_RF.predict(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.96 1.00 0.98 77\n", + " 1 0.98 0.95 0.96 56\n", + " 2 1.00 0.98 0.99 62\n", + " 3 0.99 1.00 0.99 85\n", + " 4 0.98 0.96 0.97 54\n", + "\n", + " micro avg 0.98 0.98 0.98 334\n", + " macro avg 0.98 0.98 0.98 334\n", + "weighted avg 0.98 0.98 0.98 334\n", + "\n" + ] + } + ], + "source": [ + "model_LR = pipeline2.fit(X_train,y_train)\n", + "print(classification_report(ytest, model_LR.predict(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.99 0.99 0.99 77\n", + " 1 0.98 0.98 0.98 56\n", + " 2 1.00 1.00 1.00 62\n", + " 3 1.00 1.00 1.00 85\n", + " 4 0.98 0.98 0.98 54\n", + "\n", + " micro avg 0.99 0.99 0.99 334\n", + " macro avg 0.99 0.99 0.99 334\n", + "weighted avg 0.99 0.99 0.99 334\n", + "\n" + ] + } + ], + "source": [ + "model_SVC = pipeline3.fit(X_train,y_train)\n", + "print(classification_report(ytest, model_SVC.predict(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.97 0.99 0.98 77\n", + " 1 0.98 0.98 0.98 56\n", + " 2 1.00 0.98 0.99 62\n", + " 3 1.00 1.00 1.00 85\n", + " 4 0.98 0.98 0.98 54\n", + "\n", + " micro avg 0.99 0.99 0.99 334\n", + " macro avg 0.99 0.99 0.99 334\n", + "weighted avg 0.99 0.99 0.99 334\n", + "\n" + ] + } + ], + "source": [ + "model_MLP = 
pipeline4.fit(X_train,y_train)\n", + "print(classification_report(ytest, model_MLP.predict(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Setting up the parameters for all the models." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "parameters1 = {\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3)), # trigrams or bigrams\n", + " 'chi__k': (2500,5000,\"all\"),\n", + " 'clf__n_estimators': (100,150)\n", + "}\n", + "parameters2 = {\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3)), # trigrams or bigrams\n", + " 'chi__k': (2500,5000,'all'),\n", + " 'clf__C': (0.1, 1,100),\n", + " 'clf__solver': (['saga','lbfgs'])\n", + "}\n", + "parameters3 = {\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3)), # trigrams or bigrams\n", + " 'chi__k': (2500,5000,'all'),\n", + " 'clf__multi_class':['ovr','crammer_singer'],\n", + " 'clf__C': [0.01, 1, 100]\n", + "}\n", + "parameters4 = {\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3)), # trigrams or bigrams\n", + " 'chi__k': (2500,5000,'all'),\n", + " 'clf__solver': (['sgd','lbfgs','adam'])\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing grid search...\n", + "pipeline: ['vect', 'chi', 'clf']\n", + "parameters:\n", + "{'chi__k': (2500, 5000, 'all'),\n", + " 'clf__n_estimators': (100, 150),\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3))}\n", + "Fitting 3 folds for each of 36 candidates, totalling 108 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 1.0min\n", + "[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 3.3min finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done in 205.180s\n", + "\n", + "Best score for Random Forest: 0.961\n", + "Best parameters set:\n", + "\tchi__k: 5000\n", + "\tclf__n_estimators: 150\n", + "\tvect__min_df: 2\n", + "\tvect__ngram_range: (1, 3)\n" + ] + } + ], + "source": [ + "\n", + "if __name__ == \"__main__\":\n", + " # multiprocessing requires the fork to happen in a __main__ protected\n", + " # block\n", + "\n", + " # find the best parameters for both the feature extraction and the\n", + " # classifier\n", + " grid_search = GridSearchCV(pipeline1, parameters1, cv=3,\n", + " n_jobs=-1, verbose=1,scoring='f1_macro')\n", + "\n", + " print(\"Performing grid search...\")\n", + " print(\"pipeline:\", [name for name, _ in pipeline1.steps])\n", + " print(\"parameters:\")\n", + " pprint(parameters1)\n", + " t0 = time()\n", + " grid_search.fit(X_train, y_train)\n", + " print(\"done in %0.3fs\" % (time() - t0))\n", + " print()\n", + "\n", + " print(\"Best score for Random Forest: %0.3f\" % grid_search.best_score_)\n", + " print(\"Best parameters set:\")\n", + " best_parameters = grid_search.best_estimator_.get_params()\n", + " for param_name in sorted(parameters1.keys()):\n", + " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", 
+ "output_type": "stream", + "text": [ + "Performing grid search...\n", + "pipeline: ['vect', 'chi', 'clf']\n", + "parameters:\n", + "{'chi__k': (2500, 5000, 'all'),\n", + " 'clf__C': (0.1, 1, 100),\n", + " 'clf__solver': ['saga', 'lbfgs'],\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3))}\n", + "Fitting 5 folds for each of 108 candidates, totalling 540 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 1.0min\n", + "[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 5.2min\n", + "[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 26.5min\n", + "[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 38.8min finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done in 2335.243s\n", + "\n", + "Best score for Logistic Regression: 0.985\n", + "Best parameters set:\n", + "\tchi__k: 5000\n", + "\tclf__C: 100\n", + "\tclf__solver: 'lbfgs'\n", + "\tvect__min_df: 2\n", + "\tvect__ngram_range: (1, 2)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Games\\Python\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n", + " \"of iterations.\", ConvergenceWarning)\n" + ] + } + ], + "source": [ + "\n", + "if __name__ == \"__main__\":\n", + " # multiprocessing requires the fork to happen in a __main__ protected\n", + " # block\n", + "\n", + " # find the best parameters for both the feature extraction and the\n", + " # classifier\n", + " grid_search = GridSearchCV(pipeline2, parameters2, cv=5,\n", + " n_jobs=-1, verbose=1,scoring='f1_macro')\n", + "\n", + " print(\"Performing grid search...\")\n", + " print(\"pipeline:\", [name for name, _ in pipeline2.steps])\n", + " print(\"parameters:\")\n", + " pprint(parameters2)\n", + " t0 = time()\n", + " grid_search.fit(X_train, y_train)\n", + " print(\"done in %0.3fs\" % (time() - t0))\n", + " print()\n", + "\n", + " print(\"Best score for Logistic Regression: %0.3f\" % grid_search.best_score_)\n", + " print(\"Best parameters set:\")\n", + " best_parameters = grid_search.best_estimator_.get_params()\n", + " for param_name in sorted(parameters2.keys()):\n", + " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing grid search...\n", + "pipeline: ['vect', 'chi', 'clf']\n", + "parameters:\n", + "{'chi__k': (2500, 5000, 'all'),\n", + " 'clf__C': [0.01, 1, 100],\n", + " 'clf__multi_class': ['ovr', 'crammer_singer'],\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3))}\n", + "Fitting 5 folds for each of 108 candidates, totalling 540 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 1.0min\n", + "[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 6.1min\n", + "[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 14.6min\n", + "[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 21.4min finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done in 1285.196s\n", + "\n", + "Best score for Linear SVC: 0.985\n", + "Best parameters 
set:\n", + "\tchi__k: 5000\n", + "\tclf__C: 1\n", + "\tclf__multi_class: 'ovr'\n", + "\tvect__min_df: 5\n", + "\tvect__ngram_range: (1, 2)\n" + ] + } + ], + "source": [ + "\n", + "if __name__ == \"__main__\":\n", + " # multiprocessing requires the fork to happen in a __main__ protected\n", + " # block\n", + "\n", + " # find the best parameters for both the feature extraction and the\n", + " # classifier\n", + " grid_search = GridSearchCV(pipeline3, parameters3, cv=5,\n", + " n_jobs=-1, verbose=1,scoring='f1_macro')\n", + "\n", + " print(\"Performing grid search...\")\n", + " print(\"pipeline:\", [name for name, _ in pipeline3.steps])\n", + " print(\"parameters:\")\n", + " pprint(parameters3)\n", + " t0 = time()\n", + " grid_search.fit(X_train, y_train)\n", + " print(\"done in %0.3fs\" % (time() - t0))\n", + " print()\n", + "\n", + " print(\"Best score for Linear SVC: %0.3f\" % grid_search.best_score_)\n", + " print(\"Best parameters set:\")\n", + " best_parameters = grid_search.best_estimator_.get_params()\n", + " for param_name in sorted(parameters3.keys()):\n", + " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing grid search...\n", + "pipeline: ['vect', 'chi', 'clf']\n", + "parameters:\n", + "{'chi__k': (2500, 5000, 'all'),\n", + " 'clf__solver': ['sgd', 'lbfgs', 'adam'],\n", + " 'vect__min_df': (2, 3, 5),\n", + " 'vect__ngram_range': ((1, 2), (1, 3))}\n", + "Fitting 5 folds for each of 54 candidates, totalling 270 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2min\n", + "[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 54.6min\n", + "[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 137.1min finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done in 8254.961s\n", + "\n", + "Best score for Multilayer Perceptron: 0.983\n", + "Best parameters set:\n", + "\tchi__k: 5000\n", + "\tclf__solver: 'adam'\n", + "\tvect__min_df: 3\n", + "\tvect__ngram_range: (1, 2)\n" + ] + } + ], + "source": [ + "\n", + "if __name__ == \"__main__\":\n", + " # multiprocessing requires the fork to happen in a __main__ protected\n", + " # block\n", + "\n", + " # find the best parameters for both the feature extraction and the\n", + " # classifier\n", + " grid_search = GridSearchCV(pipeline4, parameters4, cv=5,\n", + " n_jobs=-1, verbose=1,scoring='f1_macro')\n", + "\n", + " print(\"Performing grid search...\")\n", + " print(\"pipeline:\", [name for name, _ in pipeline4.steps])\n", + " print(\"parameters:\")\n", + " pprint(parameters4)\n", + " t0 = time()\n", + " grid_search.fit(X_train, y_train)\n", + " print(\"done in %0.3fs\" % (time() - t0))\n", + " print()\n", + "\n", + " print(\"Best score for Multilayer Perceptron: %0.3f\" % grid_search.best_score_)\n", + " print(\"Best parameters set:\")\n", + " best_parameters = grid_search.best_estimator_.get_params()\n", + " for param_name in sorted(parameters4.keys()):\n", + " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Best performing once again" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.97 1.00 0.99 77\n", + " 1 0.98 0.98 0.98 56\n", + " 2 1.00 0.98 0.99 62\n", + " 3 1.00 1.00 1.00 85\n", + " 4 1.00 0.98 0.99 54\n", + "\n", + " micro avg 0.99 0.99 0.99 334\n", + " macro avg 0.99 0.99 0.99 334\n", + "weighted avg 0.99 0.99 0.99 334\n", + "\n" + ] + } + ], + "source": [ + "pip_fin = Pipeline([('vect', TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')),\n", + " ('chi', SelectKBest(chi2, k=5000)),\n", + " ('clf', LinearSVC(C = 1, multi_class = 'ovr'))])\n", + "mod_fin = pip_fin.fit(X_train,y_train)\n", + "print(classification_report(ytest, mod_fin.predict(X_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Games\\Python\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n", + " \"this warning.\", FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.99 0.99 0.99 77\n", + " 1 0.98 0.98 0.98 56\n", + " 2 1.00 1.00 1.00 62\n", + " 3 1.00 1.00 1.00 85\n", + " 4 0.98 0.98 0.98 54\n", + "\n", + " micro avg 0.99 0.99 0.99 334\n", + " macro avg 0.99 0.99 0.99 334\n", + "weighted avg 0.99 0.99 0.99 334\n", + "\n" + ] + } + ], + "source": [ + "pip_fin2 = Pipeline([('vect', TfidfVectorizer(sublinear_tf=True, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')),\n", + " ('chi', SelectKBest(chi2, k=5000)),\n", + " ('clf', LogisticRegression(C = 100, solver = 'lbfgs'))])\n", + "mod_fin2 = pip_fin2.fit(X_train,y_train)\n", + "print(classification_report(ytest, mod_fin2.predict(X_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Conclusions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Regarding the results before and after tuning, it can be stated that all the models perform very well with f1-scores more than 0.96. While RF model tends to use trigrams to show its best result, other models are using bigrams and have higher f1, precision and recall macro metrics. The best identified models are Logistic Regression and Linear Support Vector Classification with f1-macro 0.985 and used bigrams and 5000 features." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "According to final results, Logistic regression perfoms the best and predicts news types politics and sport 100% correctly." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/NLP/nlp_hw_3/Sentiment_movie.ipynb b/NLP/nlp_hw_3/Sentiment_movie.ipynb new file mode 100644 index 0000000..110e6b4 --- /dev/null +++ b/NLP/nlp_hw_3/Sentiment_movie.ipynb @@ -0,0 +1,734 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NLP Homework Task №3: Movie sentiment analysis\n", + "*Author: Solonin Maxim*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The task:\n", + "\n", + "1. Choose and argue your measure of a test's accuracy.\n", + "2. Build data processing and classification pipeline; Please compare word-embeddings vs classical methods.\n", + "3. Tune your model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", + "from sklearn.metrics import accuracy_score, f1_score, classification_report\n", + "from sklearn.svm import LinearSVC\n", + "from tqdm import tqdm\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.pipeline import Pipeline\n", + "from pprint import pprint\n", + "from time import time\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.corpus import stopwords\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "import random\n", + "import gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "train = pd.read_csv('train.tsv', sep = '\\t')\n", + "test = pd.read_csv('test.tsv', sep = '\\t')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exploratory data analysis " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The number of unique sentences is: 8529\n", + "The number of Phrases is: 156060\n" + ] + } + ], + "source": [ + "print(\"The number of unique sentences is: {}\".format(train.SentenceId.nunique()))\n", + "print(\"The number of Phrases is: {}\".format(train.PhraseId.nunique()))" + ] + }, + { + 
"cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2 79582\n", + "3 32927\n", + "1 27273\n", + "4 9206\n", + "0 7072\n", + "Name: Sentiment, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.Sentiment.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAADWCAYAAAAJgFGRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAbDUlEQVR4nO3de7xWZZn/8c8XVCxlg/YTyNTR3zTj\n+ZB4Qg0dEaW0wt9UNtokdnA08DxpTpaKTjhqhAFjk5qHxsZmsuhXmtvwMM4goqB4CDQtPCKYonuD\nclC45o97PePi6WGz97MfuPfh+369ntez131fa63rWeXr4l7rXmspIjAzM7ONq0/uBMzMzHojF2Az\nM7MMXIDNzMwycAE2MzPLwAXYzMwsAxdgMzOzDFyAzczMMnABNjMzy2CT3An0FJIEbAsszZ2LmZll\n1x9YGG087coFuHG2BV7KnYSZmXUZ2wEvr6vTBbhxlgK8+OKLNDU15c7FzMwyaW1tZfvtt4f1nBHN\nWoAl9QUuBj4PDAEWAjcCl1WG7cWp3UuArwADgRnAaRHxTGk7WwOTgU8Aa4DbgDMjYlkpZi9gKrA/\n8EdgckRcUZXPZ4BLgR2BZ4DzI+KOjvympqYmF2AzM1uv3JOwzgdOA8YBuxbL5wGnl2LOA84ATgUO\nBN4CmiVtXoq5BdgdGAkcCwwHflDplNQE3AU8DwwFvgZcLOmUUszBwL8B1wMfAaYB0yTt0bifa2Zm\nlijn25Ak/QpYHBFfKrXdBiyPiM8Xo9+FwHci4qqifwCwGBgTEbdK2hWYB+wfEbOLmFHAHcB2EbFQ\n0mnAPwJDImJVEXM5MDoidimWfwJsERHHlnJ5EJgbEae247c0AS0tLS0eAZuZ9WKtra0MGDAAYEBE\ntK4rLvcI+AFghKS/BJC0N3Ao8OuifyfSqenplRUiogWYBQwrmoYBb1aKb2E66VT0gaWY+yvFt9AM\n7Cxpq1LMdNbWXNrPWiT1k9RU+ZBmvJmZmbVL7klYlwNNwFOSVgN9gW9ExC1F/5Die3HVeotLfUOA\nV8udEfGupCVVMQtqbKPS90bx3dZ+ql0AXLSOPjMzszblLsCfBU4ETgB+C+wDTJK0MCJuyprZ+k0A\nJpaW++PbkKyLOGTyIblTyGLG6TNyp2DWbrkL8JXA5RFxa7H8hKQ/I40ubwIWFe2DgVdK6w0G5hZ/\nLwIGlTcqaRNg69L6i4p1ygaX+tqKWUQNEbESWFnaZ60wMzOzmnJfA34/6Vpt2Wrey2sBqQCOqHQW\n11sPBGYWTTOBgZKGlrZxRLGNWaWY4ZI2LcWMBJ6OiDdKMSNY28jSfszMzBomdwH+JfANScdI2lHS\nccA5wM8BinuBJwEXSvqkpD2Bm0kzo6cVMfOBO4FrJR0g6RBgCnBrRCws9vNjYBVwvaTdJR0PnMna\np5CvBkZJOlfSLpIuBvYrtmVmZtZQuU9Bn0568MU/k04jLwT+BRhfirkC2IJ0X+9A4L+BURGxohRz\nIqlQ3s17D+I4o9IZES2SjiI9iGMO8BowPiJ+UIp5QNIJwGXAt0kP4hgdEU828gebmZlB5vuAexLf\nB2xdiSdhmeXTXe4DNjMz65VcgM3MzDJwATYzM8vABdjMzCwDF2AzM7MMXIDNzMwycAE2MzPLwAXY\nzMwsAxdgMzOzDFyAzczMMnABNjMzy8AF2MzMLAMXYDMzswxcgM3MzDJwATYzM8vABdjMzCwDF2Az\nM7MMXIDNzMwycAE2MzPLwAXYzMwsAxdgMzOzDFyAzczMMnABNjMzy8AF2MzMLAMXYDMzswyyF2BJ\nH5L0r5Jel7Rc0hOS9iv1S9J4Sa8U/dMl/UXVNraWdIukVklvSrpe0pZVMXtJ+i9JKyS9KOm8Grl8\nRtJTRcwTkj6+4X65mZn1ZlkLsKStgBnAO8DHgN2Ac4E3SmHnAWcApwIHAm8BzZI2L8XcAuwOjASO\nBYYDPyjtpwm4C3geGAp8DbhY0imlmIOBfwOuBz4CTAOmSdqjcb/YzMws2STz/s8HXoyIk0ttCyp/\nSBJwFnBZRPyiaPsCsBgYDdwqaVdgFLB/RMwuYk4H7pD09xGxEDgR2Az4YkSsAn4raR/gHN4r1GcC\nd0bElcXyNyWNBMaRir+ZmVnD5D4F/UlgtqT/kPSqpEclfaXUvxMwBJheaYiIFmAWMKxoGga8WSm+\nhenAGtKIuRJzf1F8K5qBnYtReCVmOmtrLu1nLZL6SWqqfID+7fvJZmZm+Qvw/wVOA54BjgauAb4n\n6aSif0jxvbhqvcWlviHAq+XOiHgXWFIVU2sbtCNmCLVdALSUPi+tI87MzOxP5C7AfYBHIuIfIuLR\niPgBcC3d45TvBGBA6bNd3nTMzKw7yV2AXwHmVbXNB3Yo/l5UfA+uihlc6lsEDCp3StoE2LoqptY2\naEfMImqIiJUR0Vr5AEtrxZmZmdWSuwDPAHauavtL0mxlSBOyFgEjKp3F9dYDgZlF00xgoKShpW0c\nQfpts0oxwyVtWooZCTwdEW+UYkawtpGl/ZiZmTVM7gL8XeAgSf8g6cOSTgBOAaYCREQAk4ALJX1S\n0p7AzcBC0m1CRMR84E7gWkkHSDoEmALcWsyABvgxsAq4XtLuko4nzXqeWMrlamCUpHMl7SLpYmC/\nYltmZmYNlbUAR8TDwHHA3wBPAt8EzoqIW0phVwCTSbcLPQxsCYyKiBWlmBOBp4C7gTuA/yYV8sp+\nWoCjSLOq5wDfAcYX15wrMQ8AlX8APAZ8GhgdEU828CebmZkBoDTItM4qTo23tLS00NTUlDsd6+UO\nmXxI7hSymHH6jNwpmNHa2sqAAQMABhRzhGrKfQrazMysV3IBNjMzy8AF2MzMLAMXYDMzswxcgM3M\nzDJwATYzM8vABdjMzCwDF2AzM7MM6irAkv4g6QM12gdK+kPn0zIzM+vZ6h0B7wj0rdHeD/hQ3dmY\nmZn1Ept0JFjSJ0uLR0tqKS33J
b1N6LkG5GVmZtajdagAU7yBCAjgpqq+d0jF99xO5mRmZtbjdagA\nR0QfAEkLgP0j4rUNkpWZmVkP19ERMAARsVOjEzEzM+tN6irAAJJGkK75DqJqMldEfLGTeZmZmfVo\ndRVgSRcB3wJmA6+QrgmbmZlZO9U7Aj4VGBMRP2pkMmZmZr1FvfcBbwY80MhEzMzMepN6C/B1wAmN\nTMTMzKw3qfcU9ObAKZKOBB4n3QP8vyLinM4mZmZm1pPVW4D3AuYWf+9R1ecJWWZmZutR733Af9Xo\nRMzMzHoTv47QzMwsg3rvA76XNk41R8QRdWdkZmbWC9R7DXhu1fKmwD6k68HVL2kwMzOzKnWdgo6I\ns6s+4yLiUGASVTOi20vS1yWFpEmlts0lTZX0uqRlkm6TNLhqvR0k3S7pbUmvSrpS0iZVMYdLekTS\nSknPShpTY/9jJT0naYWkWZIOqOd3mJmZtUejrwH/K9Dh50BL2h/4O9ItTWXfBT4BfAY4DNgW+Flp\nvb7A7aQHgxwMnASMAcaXYnYqYu4ljdInAddJOroUczwwEbgE2Bd4DGiWNKijv8XMzKw9Gl2AhwEr\nOrKCpC2BW4CvAG+U2gcAXwLOiYh7ImIOcDJwsKSDirCjgN2Az0fE3Ij4NfBNYKykzYqYU4EFEXFu\nRMyPiCnAT4GzS2mcA1wbETdExLxinbep4x8TZmZm7VHvJKyfVTcBHwT2Ay7t4OamArdHxHRJF5ba\nh5KuLU+vNETEU5JeIBX6B4vvJyJicWm9ZuAaYHfg0SJmOmtrJo2EKQr1UGBCaT9rJE0v1q1JUj+g\nX6mpf7t+rZmZGfVPwmqpWl4DPA18KyLuau9GJH2OdMp3/xrdQ4BVEfFmVfvioq8Ss7hGP+2IaZL0\nPmAroO86YnZpI/0LgIva6DczM1uneh/EcXJndyxpe+BqYGREdOi0dRcxgXTduKI/8FKmXMzMrJup\ndwQMgKShwK7F4m8j4tEOrD4UGAQ8IqnS1hcYLmkccDSwmaSBVaPgwcCi4u9FQPVs5cGlvsr34Box\nrRGxXNJqYPU6YhaxDhGxElhZWS79BjMzs/WqaxKWpEGS7gEeBr5XfOZIulvSNu3czN3AnqSZyZXP\nbNKErMrf7wAjSvvdGdgBmFk0zQT2rJqtPBJoBeaVYkawtpGVbUTEKmBO1X76FMszMTMz2wDqHQFP\nJp1y3T0i5gNI2o30EI7vAX+zvg1ExFLgyXKbpLeA1yPiyWL5emCipCWkojoZmBkRDxar3EUqtD+S\ndB7peu9lwNRihArwfWCcpCuAHwJHAJ8FjinteiJwk6TZwEPAWcAWwA3tPiJmZmYdUG8BHgUcWSm+\nABExT9JYUlFslLNJE7xuI804bga+WtrnaknHkmY9zwTeIv0j4FulmAWSjiHdU3wm6TrtlyOiuRTz\nk2LkPp5UxOcCo6pmV5uZmTVMvQW4D7WfePUOnbi3OCIOr1peAYwtPuta53ng4+vZ7n3AR9YTMwWY\n0s5UzczMOqXeYnkPcLWkbSsNkj5EGmXe3YjEzMzMerJ6C/A4oAl4TtLvJf0eWFC0nd6o5MzMzHqq\neu8DflHSvsCRvPewivkRUf3EKTMzM6uhQyNgSUdImiepKZLfRMTkiJgMPCzpt5I+uoFyNTMz6zE6\negr6LNJLC1qrOyKiBfgX0osNzMzMrA0dLcB7A3e20X8X6QlXZmZm1oaOXgMeTO3bjyreBdr7JCwz\nsy7jP4cfljuFLA67/z9zp9BrdXQE/DKwRxv9ewGv1J+OmZlZ79DRAnwHcKmkzas7ilf7XQL8qhGJ\nmZmZ9WQdPQV9GfD/gN9JmkJ6BzCkW5HGkt5m9I+NS8/MzKxn6lABjojFkg4mPXt5AlB5B1+QntM8\n1s9PNjMzW78OP4ij8uxlSVsBHyYV4Wci4o1GJ2dmZtZT1fsyBoqC+3ADczEzM+s16n5zkZmZmdXP\nBdjMzCwDF2AzM7MM6r4GbLYxvDB+z9wpZLHDt57InYKZbWAeAZuZmWXgAmxmZpaBC7CZmVkGLsBm\nZmYZuACbmZll4AJsZmaWgQuwmZlZBi7AZmZmGWQtwJIukPSwpKWSXpU0TdLOVTGbS5oq6XVJyyTd\nJmlwVcwOkm6X9HaxnSslbVIVc7ikRyStlPSspDE18hkr6TlJKyTNknTABvnhZmbW6+UeAR8GTAUO\nAkYCmwJ3SdqiFPNd4BPAZ4r4bYGfVTol9QVuBzYDDgZOAsYA40sxOxUx9wL7AJOA6yQdXYo5HpgI\nXALsCzwGNEsa1MgfbGZmBpkfRRkRo8rLxaj0VWAocL+kAcCXgBMi4p4i5mRgvqSDIuJB4ChgN+DI\niFgMzJX0TeCfJF0cEauAU4EFEXFusav5kg4Fzgaai7ZzgGsj4oZiP6cCxwBfBC7fMEfAzMx6q9wj\n4GoDiu8lxfdQ0qh4eiUgIp4CXgCGFU3DgCeK4lvRDDQBu5diprO25so2JG1W7Ku8nzXF8jDMzMwa\nrMu8jEFSH9Kp4RkR8WTRPARYFRFvVoUvLvoqMYtr9NOOmCZJ7wO2AvquI2aXdeTbD+hXaupfK87M\nzKyWrjQCngrsAXwudyLtdAHQUvq8lDcdMzPrTrpEAZY0BTgW+KuIKBeyRcBmkgZWrTK46KvEDK7R\nTztiWiNiOfAasHodMYuobQLplHnls9064szMzP5E7tuQVBTf44AjImJBVcgc4B1gRGmdnYEdgJlF\n00xgz6rZyiOBVmBeKWYEaxtZ2UYxUWtO1X76FMszqSEiVkZEa+UDLG3XjzYzMyP/NeCpwAnAp4Cl\nkirXbFsiYnlEtEi6HpgoaQmpqE4GZhYzoAHuIhXaH0k6j3S99zJgakSsLGK+D4yTdAXwQ+AI4LOk\nWc4VE4GbJM0GHgLOArYAbtgQP9zMzHq33AX4tOL7vqr2k4Ebi7/PBtYAt5EmPTUDX60ERsRqSccC\n15BGq28BNwHfKsUskHQM6Z7iM0nXa78cEc2lmJ9I2oZ0//AQYC4wqmp2tZmZWUPkvg9Y7YhZAYwt\nPuuKeR74+Hq2cx/wkfXETAGmrC8nMzOzzuoSk7DMzMx6GxdgMzOzDFyAzczMMnABNjMzy8AF2MzM\nLAMXYDMzswxcgM3MzDJwATYzM8vABdjMzCwDF2AzM7MMXIDNzMwycAE2MzPLwAXYzMwsg9yvIzQz\ns25syrm/zJ1CFuO+84lOb8MjYDMzswxcgM3MzDJwATYzM8vA14A3oqFfuzl3ClnMufILuVMwM+ty\nPAI2MzPLwAXYzMwsAxdgMzOzDFyAzczMMnABNjMzy8AF2MzMLAMXYDMzswxcgM3MzDJwAa4iaayk\n5yStkDRL0gG5czIzs57HBbhE0vHAROASYF/gMaBZ0qCsiZmZWY/jAry2c4BrI+KGiJgHnAq8DXwx\nb1pmZtbT+FnQBUmbAUOBCZW2iFgjaTowrEZ8P6Bfqak/QGtr6zr3sXrl8kal2620dUzWZ+
mK1Q3M\npPvozDEDeHf5uw3KpHvpzHF7610fs3osX/l2gzLpXto6bu09poqIRuXTrUnaFngZODgiZpbarwAO\ni4gDq+IvBi7aqEmamVl3sl1EvLyuTo+A6zeBdL24bGtgSYZc1qc/8BKwHbA0cy7dhY9ZfXzcOs7H\nrD5d/bj1Bxa2FeAC/J7XgNXA4Kr2wcCi6uCIWAmsrGru3LmcDURS5c+lEdElc+xqfMzq4+PWcT5m\n9ekGx229OXkSViEiVgFzgBGVNkl9iuWZ61rPzMysHh4Br20icJOk2cBDwFnAFsANWbMyM7MexwW4\nJCJ+ImkbYDwwBJgLjIqIxXkz67SVpHubq0+Z27r5mNXHx63jfMzq0+2Pm2dBm5mZZeBrwGZmZhm4\nAJuZmWXgAmxmZpaBC7CZmVkGLsA9nF+v2HGShkv6paSFkkLS6Nw5dXWSLpD0sKSlkl6VNE3Szrnz\n6soknSbpcUmtxWempI/lzqs7kfT14r/RSblzqYcLcA/m1yvWbQvSsRqbO5Fu5DBgKnAQMBLYFLhL\n0hZZs+raXgK+TnoJzH7APcAvJO2eNatuQtL+wN8Bj+fOpV6+DakHkzQLeDgixhXLfYAXgckRcXnW\n5LoJSQEcFxHTcufSnRT3079KepHJ/bnz6S4kLQG+FhHX586lK5O0JfAI8FXgQmBuRJyVN6uO8wi4\nhyq9XnF6pS0i1hTLf/J6RbMGG1B8d8WXk3Q5kvpK+hzp7Isffbt+U4HbI2L6eiO7MD8Jq+f6P0Bf\noPopXouBXTZ+OtZbFGdaJgEzIuLJ3Pl0ZZL2JBXczYFlpLMt8/Jm1bUV/1DZF9g/dy6d5QJsZo02\nFdgDODR3It3A08A+pDMGnyY9i/4wF+HaJG0PXA2MjIgVufPpLBfgnqtDr1c0awRJU4BjgeER8VLu\nfLq64i1szxaLc4qJRWeSJhfZnxoKDAIeKb2OsC8wXNI4oF9ErM6VXEf5GnAP5dcr2sakZApwHHBE\nRCzInVM31QfolzuJLuxuYE/SWYPKZzZwC7BPdyq+4BFwT+fXK9ahmGH54VLTTpL2AZZExAuZ0urq\npgInAJ8ClkoaUrS3RMTyfGl1XZImAL8GXgD6k47f4cDRGdPq0iJiKbDWvAJJbwGvd8f5Bi7APVgP\nfr3ihrYfcG9peWLxfRMwZqNn0z2cVnzfV9V+MnDjRs2k+xgE3Ax8EGgh3c96dET8JmtWttH4PmAz\nM7MMfA3YzMwsAxdgMzOzDFyAzczMMnABNjMzy8AF2MzMLAMXYDMzswxcgM3MzDJwATbrRSQdLikk\nDcydy8YiaYykNxuwnZA0uhE5mYELsNlGJ2kbSddIekHSSkmLJDVLOqTB+7lP0qSq5gd478lLWUm6\nUdK0RsWZdTd+FKXZxncbsBlwEvAH0huqRgAf2NA7Ll7S4bdhmXUBHgGbbUTFqd+PAudHxL0R8XxE\nPBQREyLi/5fjJF0n6Y+SWiXdI2nvUv/FkuZK+ltJz0lqkXSrpP5F/43AYcCZxanTkLRj9SnoyulZ\nScdKelrS25J+Kun9kk4qtv2GpO9J6lvafz9JV0l6WdJbkmZJOrzUX9nu0ZLmS1om6U5JH6zkT/oH\nyKdK+f3v+h08pudIeqLI40VJ/1y8UKM6brSkZyStKM44bF/V/ylJjxT9f5B0kSQPUmyDcQE227iW\nFZ/Rktp67dx/kB7W/zHSO1AfAe6WtHUp5s+B0aT37x5LKrhfL/rOJL128lrSKecPAi+uY1/vB84A\nPgeMIr2R5+fAx4vP35LeT/vp0jpTgGHFOnsV+d4p6S+qtvv3xfrDgR2Aq4q+q4B/B+4s5fdAG8ej\nLWuK/HcnFfUjgCtq/MZvAF8ADgEGArdWOiV9lPRihKuB3YrfO6ZYx2zDiAh//PFnI36AvwaWAMuB\nGcC3gb1K/YeSrtH2q1rvWeCU4u+LgbeA/qX+K4AHS8v3AZOqtnE4EMDAYnlMsfznpZjvF9vestR2\nJ/D94u8dgHeBbau2PR34dhvb/SqwqLR8IzCtHcerXXGl+E8Dr5WWK7kcWGrbpWg7oJT7BVXb+Tyw\nsLQcwOjc///xp+d8fHrFbCOLiNsk3U46FX0QaZR7nqQvR8SNwN7AlsDrksqrvo806q14LtL7USte\nIY2aO+rtiPh9aXlxse1lVW2Vbe8J9AV+V5VfP+D1NrZbb35tknQkcAGpqDaR5rZsLun9EfF2EfYu\n8HBlnYh4qpgZvSvpXdl7A4dIKo94+9bYjlnDuACbZRARK4DfFJ9LJV0HXEIa7W1JKlaH11i1fDvN\nO9Wbpb7LSrW209a2twRWk06Nr66KKxftWtsQDSRpR+BXwDWk08VLSGcQridNdGtv4dwSuAj4WY2+\nFZ3N06wWF2CzrmEe6XoupOu9Q4B3I+K5TmxzFWkU12iPFtsdFBH/1YntNCK/oaR/GJwbEWsAJH22\nRtwmwH6k0S6SdiZdB55f9D8C7BwRz3YyH7N2cwE224gkfYA0YemHwOPAUlJhOA/4RRE2nTSBapqk\n84DfAdsCxwA/j4jZ7dzdc8CBxShxGWl02GkR8TtJtwA3SzqXVJC3Id1K9XhE3N6B/I4uiuHrQEtE\nVI+aKwZI2qeq7XXSdfFNgdMl/ZI0werUGuu/A0yWdAbpdPQU0vXyh4r+8cCvJL0A/JQ0sWtvYI+I\nuLCdv8esQzwL2mzjWgbMAs4G7geeBC4lzVYeBxARQZp9fD9wA6kA3wr8GelabHtdRTpFPA/4I2ny\nVKOcTJo1/B3gaWAasD/wQge2cW2x7uwiv7YeRHI4qdCXPxdFxGPAOcD5pGN5Iul6cLW3gX8Cfkya\n+LYMOL7SGRHNpJnkR5GuFT9I+t/o+Q78HrMOUfpv3czMzDYmj4DNzMwycAE2MzPLwAXYzMwsAxdg\nMzOzDFyAzczMMnABNjMzy8AF2MzMLAMXYDMzswxcgM3MzDJwATYzM8vABdjMzCwDF2AzM7MM/gcO\nVZ8XTiMqJQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Set up graph\n", + "fig, ax = plt.subplots(1, 1, dpi = 100, figsize = (5, 2))\n", + "\n", + "# Get data\n", + "sentiment_labels = train['Sentiment'].value_counts().index\n", + "sentiment_count = train['Sentiment'].value_counts()\n", + "\n", + "# Plot graph\n", + "sns.barplot(x = sentiment_labels, y = sentiment_count)\n", + "\n", + "# Plot labels\n", + "ax.set_ylabel('Count') \n", + "ax.set_xlabel('Sentiment Label')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The most simple analysis shows that we have 8529 review sentences divided into more than 156 thousands of phrases. Majority of reviews turn out to be \"neutral\". Our data demonstrates clear imbalance in classes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I am going to use oversampling of minority classes in order to cope with sample imbalance." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Using simple oversampling repeating the observations from minority classes\n", + "sent_2 = train[train['Sentiment']==2]\n", + "#we will copy class 0 11 times\n", + "sent_0 = train[train['Sentiment']==0]\n", + "#we will copy class 1 2 times\n", + "sent_1 = train[train['Sentiment']==1]\n", + "#we will copy class 3 2 times\n", + "sent_3 = train[train['Sentiment']==3]\n", + "#we will copy class 4 8 times\n", + "sent_4 = train[train['Sentiment']==4]\n", + "\n", + "train_b = sent_2\n", + "for x in [sent_0,sent_1,sent_3,sent_4]:\n", + " train_b = train_b.append([x]*int(len(sent_2)/len(x)))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeAAAADWCAYAAAAJgFGRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAbGklEQVR4nO3de7xWZZn/8c8XVCzloP0EMnXyN814\nPuIJD+iAKKUV/abS0Saxg4OB50lzMg/ohKkRBoxNamqNjY3Z0C81t+EhZxRRUDwEmhYeEUxRNiig\nwjV/3OsZF0/P3uz97Afuffi+X6/1eva672utda1Vvi7WWvdaSxGBmZmZbVi9cidgZmbWE7kAm5mZ\nZeACbGZmloELsJmZWQYuwGZmZhm4AJuZmWXgAmxmZpaBC7CZmVkGG+VOoLuQJGBrYFnuXMzMLLu+\nwMJo5W1XLsCNszXwUu4kzMys09gGeLmlThfgxlkG8OKLL9KvX7/cuZiZWSbNzc1su+22sI4rolkL\nsKTewIXAF4DBwELgeuCSyml7cWn3IuCrwADgfuDkiHimtJ4tgSnAJ4E1wC3AaRGxvBSzOzAN2Bf4\nEzAlIi6ryudzwMXAR4FngHMi4vb27FO/fv1cgM3MbJ1yD8I6BzgZGA/sVMyfDZxSijkbOBUYC+wP\nvAU0Sdq0FHMjsAswEjgaGAb8sNIpqR9wJ/A8MAT4OnChpJNKMQcC/w5cC+wFTAemS9q1cbtrZmaW\nKOfXkCTdCiyOiC+X2m4BVkTEF4qz34XAdyPiiqK/P7AYGBMRN0naCZgH7BsRs4uYUcDtwDYRsVDS\nycA/A4Mj4p0i5lJgdETsWMz/DNgsIo4u5fIgMDcixrZhX/oBS5cuXeozYDOzHqy5uZn+/fsD9I+I\n5pbicp8BPwCMkPTXAJL2AA4Gfl30b0+6ND2jskBELAVmAUOLpqHAm5XiW5hBuhS9fynmvkrxLTQB\nO0jaohQzg7U1lbazFkl9JPWrTKQRb2ZmZm2SexDWpUA/4ClJq4HewDcj4saif3Dxu7hqucWlvsHA\nq+XOiHhP0pKqmAU11lHpe6P4bW071c4FLmihz8zMrFW5C/DngeOB44DfAXsCkyUtjIgbsma2bhOB\nSaX5vvgxJOskDppyUO4Usrj/lPtzp2DWZrkL8OXApRFxUzH/hKS/IJ1d3gAsKtoHAa+UlhsEzC3+\nXgQMLK9U0kbAlqXlFxXLlA0q9bUWs4gaImIVsKq0zVphZmZmNeW+B/xB0r3astW8n9cCUgEcUeks\n7rfuD8wsmmYCAyQNKa1jeLGOWaWYYZI2LsWMBJ6OiDdKMSNY28jSdszMzBom9xnwr4BvSnqBdAl6\nL+BM4EcAERGSJgPnSXqGVJAvJo2Mnl7EzJd0B3C1pLHAxsBU4KaIWFhs56ek+7XXSvoOsCtwGnBG\nKZcrgd9KOgu4DTgW2Ac4iQYZ8vUfN2pVXcqcy7+YOwUzs04ndwE+hVRQ/4V0GXkh8K/AhFLMZcBm\npOd6BwD/DYyKiJWlmONJRfcu3n8Rx6mVzohYKukI0os45gCvARMi4oelmAckHQdcAnyb9CKO0RHx\nZCN32MzMDDIX4IhYBpxeTC3FBHB+MbUUs4Q0kKu1bT0OHLKOmJuBm1uLMTMza4Tc94DNzMx6JBdg\nMzOzDFyAzczMMnABNjMzy8AF2MzMLIPcjyGZmVkXNvWsX+VOIYvx3/1kh9fhM2AzM7MMXIDNzMwy\ncAE2MzPLwAXYzMwsAxdgMzOzDFyAzczMMnABNjMzy8AF2MzMLAMXYDMzswxcgM3MzDJwATYzM8vA\nBdjMzCwDF2AzM7MM/DUkMzPgt8MOzZ1CFofe99vcKfRYPgM2MzPLwAXYzMwsAxdgMzOzDFyAzczM\nMnABNjMzyyB7AZb0EUn/Jul1SSskPSFpn1K/JE2Q9ErRP0PSX1WtY0tJN0pqlvSmpGslbV4Vs7uk\n/5K0UtKLks6ukcvnJD1VxDwh6RPrb8/NzKwny1qAJW0B3A+8C3wc2Bk4C3ijFHY2cCowFtgfeAto\nkrRpKeZGYBdgJHA0MAz4YWk7/YA7geeBIcDXgQslnVSKORD4d+BaYC9gOjBd0q6N22MzM7Mk93PA\n5wAvRsSJpbYFlT8kCTgduCQiflm0fRFYDIwGbpK0EzAK2DciZhcxpwC3S/rHiFgIHA9sAnwpIt4B\nfidpT+BM3i/UpwF3RMTlxfy3JI0ExpOKv5mZWcPkvgT9KWC2pJslvSrpUUlfLfVvDwwGZlQaImIp\nMAsYWjQNBd6sFN/CDGAN6Yy5EnNfUXwrmoAdirPwSswM1tZU2s5aJPWR1K8yAX3btstmZmb5C/D/\nBU4GngGOBK4Cvi/phKJ/cPG7uGq5xaW+wcCr5c6IeA9YUhVTax20IWYwtZ0LLC1NL7UQZ2Zm9mdy\nF+BewCMR8U8R8WhE/BC4mq5xyXci0L80bZM3HTMz60pyF+BXgHlVbfOB7Yq/FxW/g6piBpX6FgED\ny52SNgK2rIqptQ7aELOIGiJiVUQ0VyZgWa04MzOzWnIX4PuBHara/po0WhnSgKxFwIhKZ3G/dX9g\nZtE0ExggaUhpHcNJ+zarFDNM0salmJHA0xHxRilmBGsbWdqOmZlZw+QuwN8DDpD0T5I+Juk44CRg\nGkBEBDAZOE/SpyTtBvwYWEh6TIiImA/cAVwtaT9JBwFTgZuKEdAAPwXeAa6VtIukY0ijnieVcrkS\nGCXpLEk7SroQ2KdYl5mZWUNlLcAR8TDwGeDvgCeBbwGnR8SNpbDLgCmkx4UeBjYHRkXEylLM8cBT\nwF3A7cB/kwp5ZTtLgSNIo6rnAN8FJhT3nCsxDwCVfwA8BnwWGB0RTzZwl83MzID8zwETEbcCt7bS\nH8D5xdRSzBJS8WxtO48Dh6wj5mbg5tZizMzMGiH3JWgzM7MeyQXYzMwsAxdgMzOzDLLfAzZrzQsT\ndsudQhbbnf9E7hTMbD3zGbCZmVkGLsBmZmYZuACbmZll4AJsZmaWQV0FWNIfJX2oRvsASX/seFpm\nZmbdW71nwB8Fetdo7wN8pO5szMzMeoh2PYYk6VOl2SMlLS3N9yZ9Tei5BuRlZmbWrbX3OeDpxW8A\nN1T1vUsqvmd1MCczM7Nur10FOCJ6AUhaAOwbEa+tl6zMzMy6ubrehBUR2zc6ETMzs56k7ldRShpB\nuuc7kKrBXBHxpQ7mZWZm1q3VVYAlXUD6Pu9s4BXSPWEzMzNro3rPgMcCYyLiJ41MxszMrKeo9zng\nTYAHGpmImZlZT1JvAb4GOK6RiZiZmfUk9V6C3hQ4SdLhwOOkZ4D/V0Sc2dHEzMzMurN6C/DuwNzi\n712r+jwgy8zMbB3qfQ74bxqdiJmZWU/izxGamZllUO9zwPfQyqXmiBhed0ZmZmY9QL33gOdWzW8M\n7Em6H1z9kQYzMzOrUtcl6Ig4o2oaHxEHA5OpGhHdVpK+
ISkkTS61bSppmqTXJS2XdIukQVXLbSfp\nNklvS3pV0uWSNqqKOUzSI5JWSXpW0pga2x8n6TlJKyXNkrRfPfthZmbWFo2+B/xvQLvfAy1pX+Af\nSI80lX0P+CTwOeBQYGvgF6XlegO3kV4MciBwAjAGmFCK2b6IuYd0lj4ZuEbSkaWYY4BJwEXA3sBj\nQJOkge3dFzMzs7ZodAEeCqxszwKSNgduBL4KvFFq7w98GTgzIu6OiDnAicCBkg4owo4Adga+EBFz\nI+LXwLeAcZI2KWLGAgsi4qyImB8RU4GfA2eU0jgTuDoirouIecUyb1PHPybMzMzaot5BWL+obgI+\nDOwDXNzO1U0DbouIGZLOK7UPId1bnlFpiIinJL1AKvQPFr9PRMTi0nJNwFXALsCjRcwM1tZEOhOm\nKNRDgIml7ayRNKNYtiZJfYA+paa+bdpbMzMz6h+EtbRqfg3wNHB+RNzZ1pVIOpZ0yXffGt2DgXci\n4s2q9sVFXyVmcY1+2hDTT9IHgC2A3i3E7NhK+ucCF7TSb2Zm1qJ6X8RxYkc3LGlb4EpgZES067J1\nJzGRdN+4oi/wUqZczMysi6n3DBgASUOAnYrZ30XEo+1YfAgwEHhEUqWtNzBM0njgSGATSQOqzoIH\nAYuKvxcB1aOVB5X6Kr+DasQ0R8QKSauB1S3ELKIFEbEKWFWZL+2DmZnZOtU1CEvSQEl3Aw8D3y+m\nOZLukrRVG1dzF7AbaWRyZZpNGpBV+ftdYERpuzsA2wEzi6aZwG5Vo5VHAs3AvFLMCNY2srKOiHgH\nmFO1nV7F/EzMzMzWg3rPgKeQLrnuEhHzASTtTHoJx/eBv1vXCiJiGfBkuU3SW8DrEfFkMX8tMEnS\nElJRnQLMjIgHi0XuJBXan0g6m3S/9xJgWnGGCvADYLyky4AfAcOBzwNHlTY9CbhB0mzgIeB0YDPg\nujYfETMzs3aotwCPAg6vFF+AiJgnaRypKDbKGaQBXreQRhw3AV8rbXO1pKNJo55nAm+R/hFwfilm\ngaSjSM8Un0a6T/uViGgqxfysOHOfQCric4FRVaOrzczMGqbeAtyL2m+8epcOPFscEYdVza8ExhVT\nS8s8D3xiHeu9F9hrHTFTgaltTNXMzKxD6i2WdwNXStq60iDpI6SzzLsakZiZmVl3Vm8BHg/0A56T\n9AdJfwAWFG2nNCo5MzOz7qre54BflLQ3cDjvv6xifkRUv3HKzMzMamjXGbCk4ZLmSeoXyW8iYkpE\nTAEelvQ7SYesp1zNzMy6jfZegj6d9NGC5uqOiFgK/CvpwwZmZmbWivYW4D2AO1rpv5P0hiszMzNr\nRXsL8CBqP35U8R7Q1jdhmZmZ9VjtLcAvA7u20r878Er96ZiZmfUM7S3AtwMXS9q0uqP4tN9FwK2N\nSMzMzKw7a+9jSJcA/w/4vaSppG8AQ3oUaRzpa0b/3Lj0zMzMuqd2FeCIWCzpQNK7lycClW/wBek9\nzeP8/mQzM7N1a/eLOCrvXpa0BfAxUhF+JiLeaHRyZmZm3VW9H2OgKLgPNzAXMzOzHqPuLxeZmZlZ\n/VyAzczMMnABNjMzy8AF2MzMLAMXYDMzswxcgM3MzDJwATYzM8vABdjMzCwDF2AzM7MMXIDNzMwy\ncAE2MzPLwAXYzMwsg6wFWNK5kh6WtEzSq5KmS9qhKmZTSdMkvS5puaRbJA2qitlO0m2S3i7Wc7mk\njapiDpP0iKRVkp6VNKZGPuMkPSdppaRZkvZbLztuZmY9Xu4z4EOBacABwEhgY+BOSZuVYr4HfBL4\nXBG/NfCLSqek3sBtwCbAgcAJwBhgQilm+yLmHmBPYDJwjaQjSzHHAJOAi4C9gceAJkkDG7nDZmZm\n0IHPETZCRIwqzxdnpa8CQ4D7JPUHvgwcFxF3FzEnAvMlHRARDwJHADsDh0fEYmCupG8B35F0YUS8\nA4wFFkTEWcWm5ks6GDgDaCrazgSujojriu2MBY4CvgRcun6OgJmZ9VS5z4Cr9S9+lxS/Q0hnxTMq\nARHxFPACMLRoGgo8URTfiiagH7BLKWYGa2uqrEPSJsW2yttZU8wPxczMrMGyngGXSepFujR8f0Q8\nWTQPBt6JiDerwhcXfZWYxTX6aUNMP0kfALYAercQs2ML+fYB+pSa+taKMzMzq6UznQFPA3YFjs2d\nSBudCywtTS/lTcfMzLqSTlGAJU0Fjgb+JiLKhWwRsImkAVWLDCr6KjGDavTThpjmiFgBvAasbiFm\nEbVNJF0yr0zbtBBnZmb2Z3I/hqSi+H4GGB4RC6pC5gDvAiNKy+wAbAfMLJpmArtVjVYeCTQD80ox\nI1jbyMo6ioFac6q206uYn0kNEbEqIporE7CsTTttZmZG/nvA04DjgE8DyyRV7tkujYgVEbFU0rXA\nJElLSEV1CjCzGAENcCep0P5E0tmk+72XANMiYlUR8wNgvKTLgB8Bw4HPk0Y5V0wCbpA0G3gIOB3Y\nDLhufey4mZn1bLkL8MnF771V7ScC1xd/nwGsAW4hDXpqAr5WCYyI1ZKOBq4ina2+BdwAnF+KWSDp\nKNIzxaeR7td+JSKaSjE/k7QV6fnhwcBcYFTV6GozM7OGyP0csNoQsxIYV0wtxTwPfGId67kX2Gsd\nMVOBqevKyczMrKM6xSAsMzOznsYF2MzMLAMXYDMzswxcgM3MzDJwATYzM8vABdjMzCwDF2AzM7MM\nXIDNzMwycAE2MzPLwAXYzMwsAxdgMzOzDFyAzczMMnABNjMzy8AF2MzMLAMXYDMzswxcgM3MzDJw\nATYzM8vABdjMzCwDF2AzM7MMXIDNzMwycAE2MzPLwAXYzMwsAxdgMzOzDFyAzczMMnABNjMzy8AF\n2MzMLAMX4CqSxkl6TtJKSbMk7Zc7JzMz635cgEskHQNMAi4C9gYeA5okDcyamJmZdTsuwGs7E7g6\nIq6LiHnAWOBt4Et50zIzs+5mo9wJdBaSNgGGABMrbRGxRtIMYGiN+D5An1JTX4Dm5uYWt7F61YpG\npdultHZM1mXZytUNzKTr6MgxA3hvxXsNyqRr6chxe+s9H7N6rFj1doMy6VpaO25tPaaKiEbl06VJ\n2hp4GTgwImaW2i8DDo2I/aviLwQu2KBJmplZV7JNRLzcUqfPgOs3kXS/uGxLYEmGXNalL/ASsA2w\nLHMuXYWPWX183NrPx6w+nf249QUWthbgAvy+14DVwKCq9kHAourgiFgFrKpq7ti1nPVEUuXPZRHR\nKXPsbHzM6uPj1n4+ZvXpAsdtnTl5EFYhIt4B5gAjKm2SehXzM1tazszMrB4+A17bJOAGSbOBh4DT\ngc2A67JmZWZm3Y4LcElE/EzSVsAEYDAwFxgVEYvzZtZhq0jPNldfMreW+ZjVx8et/XzM6tPlj5tH\nQZuZmWXge8BmZmYZuACbmZll4AJsZmaWgQuwmZlZBi7A3Zw/r9h+koZJ+pWkhZJC0ujcOXV2ks6V\n9LCkZZJelTR
d0g658+rMJJ0s6XFJzcU0U9LHc+fVlUj6RvHf6OTcudTDBbgb8+cV67YZ6ViNy51I\nF3IoMA04ABgJbAzcKWmzrFl1bi8B3yB9BGYf4G7gl5J2yZpVFyFpX+AfgMdz51IvP4bUjUmaBTwc\nEeOL+V7Ai8CUiLg0a3JdhKQAPhMR03Pn0pUUz9O/SvqQyX258+kqJC0Bvh4R1+bOpTOTtDnwCPA1\n4DxgbkScnjer9vMZcDdV+rzijEpbRKwp5v/s84pmDda/+O2MHyfpdCT1lnQs6eqLX327btOA2yJi\nxjojOzG/Cav7+j9Ab6D6LV6LgR03fDrWUxRXWiYD90fEk7nz6cwk7UYquJsCy0lXW+blzapzK/6h\nsjewb+5cOsoF2MwabRqwK3Bw7kS6gKeBPUlXDD5Lehf9oS7CtUnaFrgSGBkRK3Pn01EuwN1Xuz6v\naNYIkqYCRwPDIuKl3Pl0dsVX2J4tZucUA4tOIw0usj83BBgIPFL6HGFvYJik8UCfiFidK7n28j3g\nbsqfV7QNSclU4DPA8IhYkDunLqoX0Cd3Ep3YXcBupKsGlWk2cCOwZ1cqvuAz4O7On1esQzHC8mOl\npu0l7QksiYgXMqXV2U0DjgM+DSyTNLhoXxoRK/Kl1XlJmgj8GngB6Es6focBR2ZMq1OLiGXAWuMK\nJL0FvN4Vxxu4AHdj3fjziuvbPsA9pflJxe8NwJgNnk3XcHLxe29V+4nA9Rs0k65jIPBj4MPAUtLz\nrEdGxG+yZmUbjJ8DNjMzy8D3gM3MzDJwATYzM8vABdjMzCwDF2AzM7MMXIDNzMwycAE2MzPLwAXY\nzMwsAxdgsx5E0mGSQtKA3LlsKJLGSHqzAesJSaMbkZMZuACbbXCStpJ0laQXJK2StEhSk6SDGryd\neyVNrmp+gPffvJSVpOslTW9UnFlX41dRmm14twCbACcAfyR9oWoE8KH1veHiIx3+GpZZJ+AzYLMN\nqLj0ewhwTkTcExHPR8RDETExIv5/OU7SNZL+JKlZ0t2S9ij1XyhprqS/l/ScpKWSbpLUt+i/HjgU\nOK24dBqSPlp9CbpyeVbS0ZKelvS2pJ9L+qCkE4p1vyHp+5J6l7bfR9IVkl6W9JakWZIOK/VX1nuk\npPmSlku6Q9KHK/mT/gHy6VJ+/7t8O4/pmZKeKPJ4UdK/FB/UqI4bLekZSSuLKw7bVvV/WtIjRf8f\nJV0gyScptt64AJttWMuLabSk1j47dzPpZf0fJ30D9RHgLklblmL+EhhN+v7u0aSC+42i7zTSZyev\nJl1y/jDwYgvb+iBwKnAsMIr0RZ7/BD5RTH9P+j7tZ0vLTAWGFsvsXuR7h6S/qlrvPxbLDwO2A64o\n+q4A/gO4o5TfA60cj9asKfLfhVTUhwOX1djHbwJfBA4CBgA3VTolHUL6MMKVwM7F/o4pljFbPyLC\nkydPG3AC/hZYAqwA7ge+Dexe6j+YdI+2T9VyzwInFX9fCLwF9C31XwY8WJq/F5hctY7DgAAGFPNj\nivm/LMX8oFj35qW2O4AfFH9vB7wHbF217hnAt1tZ79eARaX564HpbThebYorxX8WeK00X8ll/1Lb\njkXbfqXcz61azxeAhaX5AEbn/v+Pp+4z+fKK2QYWEbdIuo10KfoA0lnu2ZK+EhHXA3sAmwOvSyov\n+gHSWW/Fc5G+j1rxCumsub3ejog/lOYXF+teXtVWWfduQG/g91X59QFeb2W99ebXKkmHA+eSimo/\n0tiWTSV9MCLeLsLeAx6uLBMRTxUjo3cifSt7D+AgSeUz3t411mPWMC7AZhlExErgN8V0saRrgItI\nZ3ubk4rVYTUWLT9O8271aqnvtlKt9bS27s2B1aRL46ur4spFu9Y6RANJ+ihwK3AV6XLxEtIVhGtJ\nA93aWjg3By4AflGjb2VH8zSrxQXYrHOYR7qfC+l+72DgvYh4rgPrfId0FtdojxbrHRgR/9WB9TQi\nvyGkfxicFRFrACR9vkbcRsA+pLNdJO1Aug88v+h/BNghIp7tYD5mbeYCbLYBSfoQacDSj4DHgWWk\nwnA28MsibAZpANV0SWcDvwe2Bo4C/jMiZrdxc88B+xdnictJZ4cdFhG/l3Qj8GNJZ5EK8lakR6ke\nj4jb2pHfkUUxfB1YGhHVZ80V/SXtWdX2Oum++MbAKZJ+RRpgNbbG8u8CUySdSrocPZV0v/yhon8C\ncKukF4CfkwZ27QHsGhHntXF/zNrFo6DNNqzlwCzgDOA+4EngYtJo5fEAERGk0cf3AdeRCvBNwF+Q\n7sW21RWkS8TzgD+RBk81yomkUcPfBZ4GpgP7Ai+0Yx1XF8vOLvJr7UUkh5EKfXm6ICIeA84EziEd\ny+NJ94OrvQ18B/gpaeDbcuCYSmdENJFGkh9Bulf8IOl/o+fbsT9m7aL037qZmZltSD4DNjMzy8AF\n2MzMLAMXYDMzswxcgM3MzDJwATYzM8vABdjMzCwDF2AzM7MMXIDNzMwycAE2MzPLwAXYzMwsAxdg\nMzOzDFyAzczMMvgfd1asA6FkZikAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Set up graph\n", + "fig, bx = plt.subplots(1, 1, dpi = 100, figsize = (5, 2))\n", + "\n", + "# Get data\n", + "sentiment_labels_b = train_b['Sentiment'].value_counts().index\n", + "sentiment_count_b = train_b['Sentiment'].value_counts()\n", + "\n", + "# Plot graph\n", + "sns.barplot(x = sentiment_labels_b, y = sentiment_count_b)\n", + "\n", + "# Plot labels\n", + "bx.set_ylabel('Count') \n", + "bx.set_xlabel('Sentiment Label')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the classes are more or less balanced." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, I would suggest dividing the training set into train and validation ones, because test set does not have Sentiment column." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "X = train_b.drop('Sentiment',axis = 1)\n", + "y = train_b['Sentiment']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classics " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer(stop_words=\"english\", ngram_range=(1, 2))\n", + "training_features = vectorizer.fit_transform(X_train[\"Phrase\"]) \n", + "test_features = vectorizer.transform(X_test[\"Phrase\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Building classification pipelines\n", + "\n", + "pipeline2 = Pipeline([('vect', vectorizer),\n", + " ('clf', LogisticRegression(multi_class='ovr', solver='sag', random_state=42))])\n", + "\n", + "pipeline3 = Pipeline([('vect', vectorizer),\n", + " ('clf', LinearSVC(multi_class='ovr', random_state=42))])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "model_LR = pipeline2.fit(X_train['Phrase'],y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "model_SVC = pipeline3.fit(X_train['Phrase'],y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.80 0.96 0.87 23203\n", + " 1 0.70 0.44 0.54 16399\n", + " 2 0.65 0.73 0.69 23861\n", + " 3 0.66 0.50 0.57 19778\n", + " 4 0.78 0.90 0.83 22186\n", + "\n", + " accuracy 0.73 105427\n", + " macro avg 0.72 0.71 0.70 105427\n", + "weighted avg 0.72 0.73 0.71 105427\n", + "\n", + "0.713809140682\n" + ] + } + ], + "source": [ + "print(classification_report(y_test,model_LR.predict(X_test['Phrase'])))\n", + "print(f1_score(y_test,model_LR.predict(X_test['Phrase']),average='weighted'))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.85 0.99 0.91 23203\n", + " 1 0.74 0.60 0.66 16399\n", + " 2 0.72 0.69 0.71 23861\n", + " 3 0.72 0.61 0.66 19778\n", + " 4 0.83 0.95 0.89 22186\n", + "\n", + " accuracy 0.78 105427\n", + " macro avg 0.77 0.77 0.77 
105427\n", + "weighted avg 0.77 0.78 0.77 105427\n", + "\n", + "0.774277554985\n" + ] + } + ], + "source": [ + "print(classification_report(y_test,model_SVC.predict(X_test['Phrase'])))\n", + "print(f1_score(y_test,model_SVC.predict(X_test['Phrase']),average='weighted'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "LinearSVC model performs a little bit better than Logistic regression. Random forest strangely was too slow even for google engine." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model tuning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's tune the best of two models - LinearSVC" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "parameters3 = {'clf__multi_class':['ovr','crammer_singer'],\n", + " 'clf__C': [0.01, 1, 100]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing grid search...\n", + "pipeline: ['vect', 'clf']\n", + "parameters:\n", + "{'clf__C': [0.01, 1, 100], 'clf__multi_class': ['ovr', 'crammer_singer']}\n", + "Fitting 3 folds for each of 6 candidates, totalling 18 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", + "Exception in thread QueueManagerThread:\n", + "Traceback (most recent call last):\n", + " File \"/jet/var/python/lib/python3.6/threading.py\", line 916, in _bootstrap_inner\n", + " self.run()\n", + " File \"/jet/var/python/lib/python3.6/threading.py\", line 864, in run\n", + " self._target(*self._args, **self._kwargs)\n", + " File \"/jet/var/python/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py\", line 747, in _queue_management_worker\n", + " recursive_terminate(p)\n", + " File \"/jet/var/python/lib/python3.6/site-packages/joblib/externals/loky/backend/utils.py\", line 28, in recursive_terminate\n", + " _recursive_terminate_without_psutil(process)\n", + " File \"/jet/var/python/lib/python3.6/site-packages/joblib/externals/loky/backend/utils.py\", line 53, in _recursive_terminate_without_psutil\n", + " _recursive_terminate(process.pid)\n", + " File \"/jet/var/python/lib/python3.6/site-packages/joblib/externals/loky/backend/utils.py\", line 107, in _recursive_terminate\n", + " _recursive_terminate(cpid)\n", + " File \"/jet/var/python/lib/python3.6/site-packages/joblib/externals/loky/backend/utils.py\", line 94, in _recursive_terminate\n", + " stderr=None\n", + " File \"/jet/var/python/lib/python3.6/subprocess.py\", line 336, in check_output\n", + " **kwargs).stdout\n", + " File \"/jet/var/python/lib/python3.6/subprocess.py\", line 418, in run\n", + " output=stdout, stderr=stderr)\n", + "subprocess.CalledProcessError: Command '['pgrep', '-P', '3442']' died with .\n", + "\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 832\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'supports_timeout'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 833\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 834\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mwrap_future_result\u001b[0;34m(future, timeout)\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLokyTimeoutError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/concurrent/futures/_base.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 295\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 296\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mpprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparameters3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mt0\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mgrid_search\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Phrase'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"done in %0.3fs\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mt0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 686\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 688\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[0;31m# For multi-metric evaluation, store the best_index_, best_params_ and\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[0;34m(self, evaluate_candidates)\u001b[0m\n\u001b[1;32m 1146\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1147\u001b[0m \u001b[0;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1148\u001b[0;31m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1149\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[0;34m(candidate_params)\u001b[0m\n\u001b[1;32m 664\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m in product(candidate_params,\n\u001b[0;32m--> 666\u001b[0;31m cv.split(X, y, groups)))\n\u001b[0m\u001b[1;32m 667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 933\u001b[0m \u001b[0;32mwith\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 934\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 935\u001b[0m \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 936\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 853\u001b[0m \u001b[0;31m# scheduling.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[0mensure_ready\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_managed_backend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 855\u001b[0;31m \u001b[0mbackend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabort_everything\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mensure_ready\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_ready\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 856\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexception\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTransportableException\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mabort_everything\u001b[0;34m(self, ensure_ready)\u001b[0m\n\u001b[1;32m 536\u001b[0m \"\"\"Shutdown the workers and restart a new one with the same parameters\n\u001b[1;32m 537\u001b[0m \"\"\"\n\u001b[0;32m--> 538\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_workers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshutdown\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkill_workers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 539\u001b[0m \u001b[0mdelete_folder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_workers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_temp_folder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_workers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py\u001b[0m in \u001b[0;36mshutdown\u001b[0;34m(self, wait, kill_workers)\u001b[0m\n\u001b[1;32m 1094\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1095\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1096\u001b[0;31m 
\u001b[0mqmt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1097\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[0mcq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_queue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/threading.py\u001b[0m in \u001b[0;36mjoin\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1055\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1056\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_wait_for_tstate_lock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1057\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[0;31m# the behavior of a negative timeout isn't documented, but\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/var/python/lib/python3.6/threading.py\u001b[0m in \u001b[0;36m_wait_for_tstate_lock\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m 1070\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlock\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# already determined that the C code is done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1071\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_stopped\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1072\u001b[0;31m \u001b[0;32melif\u001b[0m \u001b[0mlock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1073\u001b[0m \u001b[0mlock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrelease\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1074\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "if __name__ == \"__main__\":\n", + " # multiprocessing requires the fork to happen in a __main__ protected\n", + " # block\n", + "\n", + " # find the best parameters for both the feature extraction and the\n", + " # classifier\n", + " grid_search = GridSearchCV(pipeline3, parameters3, cv=3,\n", + " n_jobs=-1, verbose=1,scoring='f1_weighted')\n", + "\n", + " print(\"Performing grid search...\")\n", + " print(\"pipeline:\", [name for name, _ in pipeline3.steps])\n", + " print(\"parameters:\")\n", + " pprint(parameters3)\n", + " t0 = time()\n", + " grid_search.fit(X_train['Phrase'], y_train)\n", + " print(\"done in %0.3fs\" % (time() - t0))\n", + " print()\n", + "\n", + " print(\"Best score for Linear SVC: %0.3f\" % grid_search.best_score_)\n", + " print(\"Best parameters set:\")\n", + " best_parameters = grid_search.best_estimator_.get_params()\n", + " for param_name in sorted(parameters3.keys()):\n", + " print(\"\\t%s: %r\" % (param_name, best_parameters[param_name]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
Embedding way" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I decided to try word2vec, which analyses the words by their surroundings. I have chosen CBOW method, because it requires less computer power and more intuitive from human perspective - identifies the word, by surrounding, not vice versa." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "w2v = gensim.models.Word2Vec(train['Phrase'], size = 200, window = 7, min_count = 40, sample = 0.001)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "w2v.init_sims(replace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/245995 [00:00