Added custom weights support and fixed vocabulary ordering
SauravPattnaikCS60 committed Nov 12, 2022
1 parent f5cf28d commit de45736
Showing 9 changed files with 114 additions and 90 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -68,4 +68,10 @@ test_df = wcbtfidf.transform(xtest)
wcbtfidf.combine_vocab
# Get the class wise vocab
wcbtfidf.class_wise_vocab

# Added support for providing a custom class-wise feature split
wcbtfidf = Wcbtfidf(max_features=100,custom_weights={0:20,1:80}) # Control how many features each class contributes

# Here xtrain and xtest each refer to a single pandas column containing the text data,
# and ytrain/ytest to the categorical output labels
```
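For illustration, a minimal end-to-end sketch of the new `custom_weights` flow. The import path and the 20/80 split are assumptions for the example, not prescribed values:

```python
# Minimal sketch of the custom_weights option, assuming the API shown above.
# custom_weights={0: 20, 1: 80} is an arbitrary split: class 0 contributes
# 20 features and class 1 contributes 80, rather than an even split.
from wcbtfidf import Wcbtfidf  # import path assumed

wcbtfidf = Wcbtfidf(max_features=100, custom_weights={0: 20, 1: 80})
wcbtfidf.fit(xtrain, ytrain)           # xtrain: text column, ytrain: labels
train_df = wcbtfidf.transform(xtrain)
test_df = wcbtfidf.transform(xtest)

# Assuming class_wise_vocab maps each label to its vocabulary,
# the per-class sizes should mirror the requested split
print({label: len(vocab) for label, vocab in wcbtfidf.class_wise_vocab.items()})
```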
26 changes: 13 additions & 13 deletions demos/analysis.ipynb
@@ -359,8 +359,8 @@
" \n",
" print('Running base version')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -371,7 +371,7 @@
" print(classification_report(ytest,preds))\n",
" \n",
" print('Running my version')\n",
" wcbtfidf = Wcbtfidf(max_features=max_feat)\n",
" wcbtfidf = Wcbtfidf(max_features=max_feat,custom_weights={0:100,1:200})\n",
" wcbtfidf.fit(xtrain,ytrain)\n",
" \n",
" train_df = wcbtfidf.transform(xtrain)\n",
@@ -410,16 +410,16 @@
"weighted avg 0.88 0.90 0.87 125000\n",
"\n",
"Running my version\n",
"Precision is 0.9100734976221357\n",
"Recall is 0.9916888888888888\n",
"ROC curve is 0.5548844444444444\n",
"Precision is 0.9081446934364691\n",
"Recall is 0.9935022222222222\n",
"ROC curve is 0.5445511111111111\n",
" precision recall f1-score support\n",
"\n",
" 0 0.61 0.12 0.20 12500\n",
" 0 0.62 0.10 0.17 12500\n",
" 1 0.91 0.99 0.95 112500\n",
"\n",
" accuracy 0.90 125000\n",
" macro avg 0.76 0.55 0.57 125000\n",
" macro avg 0.76 0.54 0.56 125000\n",
"weighted avg 0.88 0.90 0.87 125000\n",
"\n"
]
@@ -448,7 +448,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 15,
"id": "c0b5fa7c",
"metadata": {},
"outputs": [
@@ -471,15 +471,15 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 16,
"id": "dc686a7e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['remember', 'hopefully', 'stop', 'tv', 'did', 'kids', 'rock', 'having', '100', 'send', 'came', 'taking', 'tho', 'beach', 'enjoying', 'true', 'seen', 'says', 'just', 'stay', 'lmao', 'kind', 'using', 'mileycyrus', 'saturday', 'idea', 'dad', 'loved', 'outside', 'doing', 'quite', 'plurk', 'crazy', 'ill', 'don', 'room', 'meet', 'watched', 'totally', 'talking', 'ah', 'guy', 'shopping', 'wonderful', 'breakfast', 'end', 'years', 'probably', 'lots', 'till', 'believe', 'fine', 'super', 'fm', 'cause', 'pics', 'hour', 'busy', 'rest', 'mind', 'weeks', 'buy', 'does', 'girls']\n"
"['crazy', '100', 'enjoying', 'pics', 'guy', 'idea', 'hour', 'beach', 'leave', 'taking', 'doing', 'talking', 'cause', 'stay', 'aww', 'outside', 'mind', 'fine', 'haven', 'years', 'rock', 'using', 'breakfast', 'doesn', 'ill', 'having', 'busy', 'lmao', 'says', 'remember', 'did', 'plurk', 'probably', 'end', 'weeks', 'room', 'seen', 'car', 'watched', 'bored', 'tv', 'came', 'quite', 'don', 'ah', 'isn', 'dad', 'believe', 'kind', 'tho', 'woke', 'does', 'saturday', 'fm', 'super', 'rest', 'just']\n"
]
}
],
@@ -499,15 +499,15 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 17,
"id": "56346eb2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['give', 'stupid', 'may', 'didnt', 'nothing', 'yet', 'always', 'well', 'find', 'sucks', 'ever', 'never', 'keep', 'around', 'another', 'almost', 'ugh', 'cold', 'done', 'many', 'hurts', 'one', 'two', 'back', 'show', 're', 'cant', 'everything', 'something', 'take', 'could', 'would', 'shit', 'put', 'everyone', 'already', 'see', 'even', 'though', 'get', 'might', 'headache', 'still', 'gone', 'first', 'must', 'last', 'go', 'someone', 'since', 'much', 'wanted', 'iphone', 'poor', 'call', 'next', 'also', 'please', 'made', 'least', 'name', 'missing', 'found', 'us']\n"
"['least', 'get', 'anything', 'give', 'ever', 'around', 'put', 'anyone', 'might', 'done', 'first', 'always', 'back', 're', 'go', 'last', 'would', 'mine', 'well', 'almost', 'cant', 'keep', 'take', 'ugh', 'please', 'show', 'much', 'may', 'even', 'could', 'nothing', 'someone', 'made', 'find', 'yet', 'next', 'call', 'two', 'must', 'everything', 'also', 'found', 'already', 'see', 'name', 'sucks', 'another', 'still', 'since', 'something', 'us', 'every', 'one', 'everyone', 'many', 'never', 'though']\n"
]
}
],
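A note on the recurring `vocabulary_` → `get_feature_names_out()` change above, which is the "fixed vocabulary ordering" part of this commit: `TfidfVectorizer.vocabulary_` is a `{term: column_index}` dict whose key order does not match the column order of the transformed matrix, so using its keys as DataFrame column names mislabels the columns; `get_feature_names_out()` returns terms in column order. A minimal sketch with a made-up toy corpus:

```python
# Demonstrates why naming columns from vocabulary_ is a bug:
# vocabulary_ keys come back in insertion (first-occurrence) order,
# while matrix columns are ordered by index, i.e. alphabetically here.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat", "the dog barked at the cat"]  # toy data
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)

print(list(tfidf.vocabulary_))              # e.g. ['the', 'cat', 'sat', 'dog', 'barked', 'at']
print(list(tfidf.get_feature_names_out()))  # ['at', 'barked', 'cat', 'dog', 'sat', 'the']
```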
35 changes: 18 additions & 17 deletions demos/blog_notebook.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 1,
"id": "cc4609a5",
"metadata": {},
"outputs": [],
@@ -20,6 +20,7 @@
"\n",
"import os\n",
"import sys\n",
"\n",
"module_path = os.path.abspath(os.path.join('..'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
@@ -29,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 2,
"id": "00783250",
"metadata": {},
"outputs": [],
@@ -55,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 3,
"id": "9c95a6f2",
"metadata": {},
"outputs": [
Expand All @@ -65,8 +66,8 @@
"text": [
"Shape of the dataset is (50000, 2)\n",
"Target distribution is \n",
"positive 0.5\n",
"negative 0.5\n",
"positive 0.5\n",
"Name: sentiment, dtype: float64\n",
"Final data shape is (25000, 2)\n",
"Final target distribution is \n",
@@ -82,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 4,
"id": "693f6b13",
"metadata": {},
"outputs": [],
@@ -96,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 5,
"id": "f3a94bcd",
"metadata": {},
"outputs": [],
Expand All @@ -106,7 +107,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 6,
"id": "329ae9b2",
"metadata": {},
"outputs": [
Expand All @@ -129,7 +130,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 19,
"id": "a9a1dd1d",
"metadata": {},
"outputs": [],
Expand All @@ -138,8 +139,8 @@
" \n",
" print('Running TFIDF')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -162,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 20,
"id": "507a98ca",
"metadata": {},
"outputs": [
Expand All @@ -184,7 +185,7 @@
" precision recall f1-score support\n",
"\n",
" 0 0.93 0.99 0.96 5750\n",
" 1 0.75 0.19 0.31 500\n",
" 1 0.74 0.19 0.30 500\n",
"\n",
" accuracy 0.93 6250\n",
" macro avg 0.84 0.59 0.63 6250\n",
@@ -208,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 13,
"id": "f4ea7bd8",
"metadata": {},
"outputs": [
Expand All @@ -231,15 +232,15 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"id": "66f9c177",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['blood', 'direction', 'starts', 'annoying', 'stuff', 'god', 'car', 'came', 'episode', 'lost', 'wouldn', 'picture', 'totally', 'sound', 'cheap', 'care', 'save', 'quality', 'stars', 'hell', 'certainly', 'thinking', 'beginning', 'face', 'boy', 'yes', 'flick', 'kill', 'nice', 'truly', 'stop', 'killed', 'hope', 'written', 'attempt', 'moments', 'children', 'don', 'having', 'lead', 'person', 'felt', 'called', 'overall', 'wanted', 'white', 'writing', 'finally', 'evil', 'entire', 'obviously', 'does', 'girls', 'happens', 'turn', 'run', 'just', 'did', 'act', 'lack', 'looked', 'small', 'ridiculous', 'doing', 'gave', 'gore', 'title', 'game', 'example', 'hour', 'ok', 'case', 'playing', 'tries', 'recommend', 'decent', 'style']\n"
"['absolutely', 'annoying', 'blood', 'totally', 'run', 'lost', 'hour', 'stuff', 'case', 'simply', 'save', 'waste', 'start', 'does', 'sort', 'starts', 'cheap', 'turn', 'mean', 'game', 'given', 'killed', 'looked', 'attempt', 'getting', 'oh', 'did', 'piece', 'happens', 'understand', 'quality', 'sex', 'small', 'don', 'lack', 'person', 'laugh', 'obviously', 'problem', 'kill', 'god', 'came', 'gore', 'killer', 'flick', 'stop', 'worse', 'having', 'wrong', 'went', 'just', 'girls', 'horrible', 'unfortunately', 'felt', 'decent', 'guess', 'called', 'car', 'tries', 'ok', 'supposed', 'try', 'finally', 'yes', 'use', 'guys', 'face', 'wouldn', 'gave', 'couple', 'production', 'example', 'thinking', 'writing', 'rest', 'children', 'low', 'budget', 'video', 'entire', 'lead', 'dialogue', 'hell', 'sound', 'head', 'couldn', 'wanted', 'direction', 'camera', 'ridiculous', 'doing', 'beginning', 'dead', 'crap', 'care', 'lines', 'act']\n"
]
}
],
@@ -251,15 +252,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"id": "11af923d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['first', 'though', 'get', 'since', 'show', 'take', 'enough', 'although', 'another', 'seem', 'however', 'excellent', 'made', 'done', 'find', 'even', 'loved', 'definitely', 'back', 'go', 'still', 'anything', 'top', 'someone', 'rather', 'perfect', 'one', 'might', 'else', 'see', 'could', 'human', 'must', 'today', 'also', 'well', 'many', 'performances', 'everything', 'two', 'nothing', 'without', 'ever', 'seems', 'part', 'seemed', 'wonderful', 'every', 'less', 'yet', 'amazing', 'us', 'put', 'together', 'anyone', 'either', 'always', 'along', 're', 'become', 'would', 'gives', 'something', 'never', 'almost', 'may', 'give', 'found', 'around', 'next', 'much', 'name', 'everyone', 'last', 'least', 'three', 'whole']\n"
"['take', 'give', 'heart', 'name', 'also', 'first', 'never', 'might', 'mother', 'matter', 'several', 'highly', 'perhaps', 'anyone', 'able', 'last', 'drama', 'rather', 'two', 'even', 'nothing', 'another', 'seems', 'see', 'three', 'much', 'top', 'since', 'put', 'part', 'find', 'less', 'cinema', 'one', 'almost', 'ever', 'wants', 'enough', 'us', 're', 'would', 'son', 'human', 'performances', 'together', 'something', 'many', 'however', 'yet', 'definitely', 'gives', 'others', 'always', 'every', 'live', 'around', 'go', 'least', 'get', 'though', 'without', 'wonderful', 'anything', 'keep', 'although', 'favorite', 'still', 'lives', 'show', 'found', 'full', 'loved', 'well', 'experience', 'today', 'made', 'excellent', 'back', 'brilliant', 'liked', 'must', 'entertaining', 'amazing', 'everything', 'become', 'next', 'seem', 'enjoyed', 'someone', 'everyone', 'along', 'perfect', 'art', 'done', 'often', 'may', 'could', 'whole']\n"
]
}
],
8 changes: 4 additions & 4 deletions demos/main_notebook.ipynb
@@ -359,8 +359,8 @@
" \n",
" print('Running base version')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -471,7 +471,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['totally', 'breakfast', 'does', 'guy', 'mileycyrus', 'doing', 'years', 'cause', 'stop', 'hour', 'shopping', 'just', 'pics', '100', 'dad', 'loved', 'rest', 'wonderful', 'kids', 'fm', 'seen', 'plurk', 'till', 'beach', 'end', 'outside', 'probably', 'tho', 'talking', 'says', 'lmao', 'send', 'busy', 'idea', 'saturday', 'kind', 'tv', 'came', 'having', 'super', 'rock', 'believe', 'true', 'stay', 'don', 'remember', 'room', 'enjoying', 'did', 'lots', 'crazy', 'buy', 'girls', 'taking', 'meet', 'mind', 'quite', 'hopefully', 'weeks', 'using', 'fine', 'ill', 'ah', 'watched']\n"
"['room', 'fine', 'guy', 'came', 'send', 'end', 'outside', 'doing', 'beach', 'ah', 'shopping', 'says', 'true', 'breakfast', 'kind', 'hopefully', 'till', 'ill', 'weeks', 'don', 'hour', 'watched', 'tho', 'busy', 'years', 'mind', 'just', 'stop', 'stay', 'pics', '100', 'kids', 'did', 'quite', 'super', 'loved', 'using', 'seen', 'crazy', 'cause', 'meet', 'fm', 'saturday', 'taking', 'tv', 'does', 'believe', 'rest', 'idea', 'probably', 'enjoying', 'having', 'dad', 'lmao', 'mileycyrus', 'talking', 'buy', 'totally', 'remember', 'rock', 'wonderful', 'plurk', 'girls', 'lots']\n"
]
}
],
@@ -491,7 +491,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['hurts', 'first', 'name', 'get', 'still', 'give', 'ever', 'poor', 'since', 'well', 'might', 'almost', 'ugh', 'wanted', 'someone', 'go', 'also', 'much', 'least', 'show', 'around', 'see', 'something', 'sucks', 'always', 'may', 'cant', 'must', 'yet', 'never', 'would', 'keep', 'two', 'iphone', 'cold', 'back', 'one', 'missing', 'stupid', 'put', 'made', 'find', 'us', 'take', 'everything', 'next', 'headache', 'even', 'please', 'already', 'found', 'call', 'another', 'many', 'though', 'last', 're', 'shit', 'done', 'gone', 'didnt', 'everyone', 'nothing', 'could']\n"
"['also', 'go', 'found', 'gone', 'last', 'might', 'someone', 'missing', 'shit', 'hurts', 'though', 'something', 'always', 'well', 'least', 'one', 'please', 'cant', 'already', 'find', 'must', 'would', 'cold', 'wanted', 'many', 'back', 'around', 'call', 'stupid', 'show', 'even', 'ever', 're', 'could', 'ugh', 'first', 'give', 'next', 'almost', 'everything', 'everyone', 'another', 'made', 'yet', 'didnt', 'two', 'name', 'us', 'never', 'keep', 'much', 'done', 'see', 'still', 'get', 'headache', 'take', 'poor', 'put', 'may', 'since', 'iphone', 'nothing', 'sucks']\n"
]
}
],
10 changes: 5 additions & 5 deletions demos/main_notebook_imdb.ipynb
@@ -137,8 +137,8 @@
{
"data": {
"text/plain": [
"negative 0.5\n",
"positive 0.5\n",
"negative 0.5\n",
"Name: sentiment, dtype: float64"
]
},
@@ -297,8 +297,8 @@
" \n",
" print('Running base version')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -409,7 +409,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['hour', 'obviously', 'looked', 'moments', 'style', 'overall', 'playing', 'did', 'yes', 'care', 'tries', 'gore', 'act', 'doing', 'felt', 'entire', 'run', 'god', 'called', 'don', 'girls', 'person', 'came', 'writing', 'sound', 'gave', 'quality', 'hell', 'car', 'children', 'stuff', 'ridiculous', 'kill', 'flick', 'cheap', 'stop', 'blood', 'starts', 'happens', 'face', 'thinking', 'totally', 'attempt', 'annoying', 'case', 'truly', 'nice', 'beginning', 'small', 'picture', 'stars', 'certainly', 'direction', 'written', 'episode', 'does', 'boy', 'wouldn', 'recommend', 'just', 'wanted', 'hope', 'having', 'turn', 'killed', 'ok', 'lead', 'decent', 'game', 'lost', 'example', 'save', 'finally', 'evil', 'title', 'white', 'lack']\n"
"['felt', 'totally', 'blood', 'flick', 'yes', 'boy', 'don', 'moments', 'recommend', 'came', 'wouldn', 'tries', 'stop', 'beginning', 'title', 'gore', 'white', 'thinking', 'girls', 'car', 'finally', 'style', 'certainly', 'written', 'annoying', 'episode', 'did', 'lead', 'ridiculous', 'stars', 'decent', 'person', 'case', 'hour', 'quality', 'small', 'children', 'looked', 'face', 'care', 'killed', 'gave', 'lack', 'kill', 'ok', 'doing', 'overall', 'entire', 'called', 'happens', 'attempt', 'does', 'save', 'playing', 'cheap', 'direction', 'wanted', 'sound', 'nice', 'truly', 'evil', 'picture', 'example', 'god', 'stuff', 'hope', 'act', 'turn', 'run', 'hell', 'game', 'starts', 'having', 'lost', 'just', 'writing', 'obviously']\n"
]
}
],
@@ -429,7 +429,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['get', 'although', 'every', 'perfect', 'gives', 'else', 'part', 'excellent', 'also', 'would', 'one', 'seemed', 'along', 'less', 'see', 'either', 'name', 'done', 'find', 'another', 'something', 'least', 'nothing', 'almost', 'wonderful', 'loved', 'rather', 'since', 'seem', 'could', 'become', 'back', 'however', 'two', 'much', 'seems', 'last', 'definitely', 'made', 'top', 'performances', 'may', 'someone', 'ever', 'give', 'human', 're', 'never', 'everyone', 'still', 'found', 'us', 'might', 'show', 'go', 'without', 'around', 'today', 'even', 'anyone', 'three', 'everything', 'next', 'must', 'though', 'anything', 'together', 'first', 'put', 'amazing', 'many', 'always', 'yet', 'whole', 'well', 'enough', 'take']\n"
"['even', 'definitely', 'since', 'although', 'top', 'also', 'rather', 'always', 'three', 'perfect', 'seemed', 'almost', 'next', 'back', 'us', 'without', 'ever', 'seems', 'never', 'would', 'something', 'every', 'everything', 'someone', 'get', 'around', 'human', 'nothing', 'enough', 'loved', 'whole', 'everyone', 'amazing', 'name', 'still', 'find', 're', 'see', 'well', 'much', 'last', 'another', 'least', 'anything', 'seem', 'however', 'show', 'many', 'found', 'give', 'one', 'take', 'yet', 'become', 'though', 'anyone', 'might', 'excellent', 'less', 'performances', 'could', 'made', 'first', 'wonderful', 'part', 'go', 'today', 'together', 'else', 'two', 'gives', 'must', 'along', 'may', 'done', 'either', 'put']\n"
]
}
],
(diffs for the remaining 4 changed files not shown)
