Added custom weights support and fixed vocabulary ordering
SauravPattnaikCS60 committed Nov 12, 2022
1 parent f5cf28d commit de45736
Showing 9 changed files with 114 additions and 90 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -68,4 +68,10 @@ test_df = wcbtfidf.transform(xtest)
wcbtfidf.combine_vocab
# Get the class wise vocab
wcbtfidf.class_wise_vocab

# Added support for providing a custom class-wise feature split
wcbtfidf = Wcbtfidf(max_features=100,custom_weights={0:20,1:80}) # Control how many features each class contributes

# Here xtrain and xtest each refer to a single pandas column containing the text data,
# and ytrain/ytest to the categorical output labels
```
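For illustration, a minimal end-to-end sketch of the new `custom_weights` flow. The import path and the 20/80 split are assumptions for the example, not prescribed values:

```python
# Minimal sketch of the custom_weights option, assuming the API shown above.
# custom_weights={0: 20, 1: 80} is an arbitrary split: class 0 contributes
# 20 features and class 1 contributes 80, rather than an even split.
from wcbtfidf import Wcbtfidf  # import path assumed

wcbtfidf = Wcbtfidf(max_features=100, custom_weights={0: 20, 1: 80})
wcbtfidf.fit(xtrain, ytrain)           # xtrain: text column, ytrain: labels
train_df = wcbtfidf.transform(xtrain)
test_df = wcbtfidf.transform(xtest)

# Assuming class_wise_vocab maps each label to its vocabulary,
# the per-class sizes should mirror the requested split
print({label: len(vocab) for label, vocab in wcbtfidf.class_wise_vocab.items()})
```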
26 changes: 13 additions & 13 deletions demos/analysis.ipynb
@@ -359,8 +359,8 @@
" \n",
" print('Running base version')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -371,7 +371,7 @@
" print(classification_report(ytest,preds))\n",
" \n",
" print('Running my version')\n",
" wcbtfidf = Wcbtfidf(max_features=max_feat)\n",
" wcbtfidf = Wcbtfidf(max_features=max_feat,custom_weights={0:100,1:200})\n",
" wcbtfidf.fit(xtrain,ytrain)\n",
" \n",
" train_df = wcbtfidf.transform(xtrain)\n",
@@ -410,16 +410,16 @@
"weighted avg 0.88 0.90 0.87 125000\n",
"\n",
"Running my version\n",
"Precision is 0.9100734976221357\n",
"Recall is 0.9916888888888888\n",
"ROC curve is 0.5548844444444444\n",
"Precision is 0.9081446934364691\n",
"Recall is 0.9935022222222222\n",
"ROC curve is 0.5445511111111111\n",
" precision recall f1-score support\n",
"\n",
" 0 0.61 0.12 0.20 12500\n",
" 0 0.62 0.10 0.17 12500\n",
" 1 0.91 0.99 0.95 112500\n",
"\n",
" accuracy 0.90 125000\n",
" macro avg 0.76 0.55 0.57 125000\n",
" macro avg 0.76 0.54 0.56 125000\n",
"weighted avg 0.88 0.90 0.87 125000\n",
"\n"
]
@@ -448,7 +448,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 15,
"id": "c0b5fa7c",
"metadata": {},
"outputs": [
@@ -471,15 +471,15 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 16,
"id": "dc686a7e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['remember', 'hopefully', 'stop', 'tv', 'did', 'kids', 'rock', 'having', '100', 'send', 'came', 'taking', 'tho', 'beach', 'enjoying', 'true', 'seen', 'says', 'just', 'stay', 'lmao', 'kind', 'using', 'mileycyrus', 'saturday', 'idea', 'dad', 'loved', 'outside', 'doing', 'quite', 'plurk', 'crazy', 'ill', 'don', 'room', 'meet', 'watched', 'totally', 'talking', 'ah', 'guy', 'shopping', 'wonderful', 'breakfast', 'end', 'years', 'probably', 'lots', 'till', 'believe', 'fine', 'super', 'fm', 'cause', 'pics', 'hour', 'busy', 'rest', 'mind', 'weeks', 'buy', 'does', 'girls']\n"
"['crazy', '100', 'enjoying', 'pics', 'guy', 'idea', 'hour', 'beach', 'leave', 'taking', 'doing', 'talking', 'cause', 'stay', 'aww', 'outside', 'mind', 'fine', 'haven', 'years', 'rock', 'using', 'breakfast', 'doesn', 'ill', 'having', 'busy', 'lmao', 'says', 'remember', 'did', 'plurk', 'probably', 'end', 'weeks', 'room', 'seen', 'car', 'watched', 'bored', 'tv', 'came', 'quite', 'don', 'ah', 'isn', 'dad', 'believe', 'kind', 'tho', 'woke', 'does', 'saturday', 'fm', 'super', 'rest', 'just']\n"
]
}
],
@@ -499,15 +499,15 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 17,
"id": "56346eb2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['give', 'stupid', 'may', 'didnt', 'nothing', 'yet', 'always', 'well', 'find', 'sucks', 'ever', 'never', 'keep', 'around', 'another', 'almost', 'ugh', 'cold', 'done', 'many', 'hurts', 'one', 'two', 'back', 'show', 're', 'cant', 'everything', 'something', 'take', 'could', 'would', 'shit', 'put', 'everyone', 'already', 'see', 'even', 'though', 'get', 'might', 'headache', 'still', 'gone', 'first', 'must', 'last', 'go', 'someone', 'since', 'much', 'wanted', 'iphone', 'poor', 'call', 'next', 'also', 'please', 'made', 'least', 'name', 'missing', 'found', 'us']\n"
"['least', 'get', 'anything', 'give', 'ever', 'around', 'put', 'anyone', 'might', 'done', 'first', 'always', 'back', 're', 'go', 'last', 'would', 'mine', 'well', 'almost', 'cant', 'keep', 'take', 'ugh', 'please', 'show', 'much', 'may', 'even', 'could', 'nothing', 'someone', 'made', 'find', 'yet', 'next', 'call', 'two', 'must', 'everything', 'also', 'found', 'already', 'see', 'name', 'sucks', 'another', 'still', 'since', 'something', 'us', 'every', 'one', 'everyone', 'many', 'never', 'though']\n"
]
}
],
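A note on the recurring `vocabulary_` → `get_feature_names_out()` change above, which is the "fixed vocabulary ordering" part of this commit: `TfidfVectorizer.vocabulary_` is a `{term: column_index}` dict whose key order does not match the column order of the transformed matrix, so using its keys as DataFrame column names mislabels the columns; `get_feature_names_out()` returns terms in column order. A minimal sketch with a made-up toy corpus:

```python
# Demonstrates why naming columns from vocabulary_ is a bug:
# vocabulary_ keys come back in insertion (first-occurrence) order,
# while matrix columns are ordered by index, i.e. alphabetically here.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat", "the dog barked at the cat"]  # toy data
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)

print(list(tfidf.vocabulary_))              # e.g. ['the', 'cat', 'sat', 'dog', 'barked', 'at']
print(list(tfidf.get_feature_names_out()))  # ['at', 'barked', 'cat', 'dog', 'sat', 'the']
```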
35 changes: 18 additions & 17 deletions demos/blog_notebook.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 1,
"id": "cc4609a5",
"metadata": {},
"outputs": [],
@@ -20,6 +20,7 @@
"\n",
"import os\n",
"import sys\n",
"\n",
"module_path = os.path.abspath(os.path.join('..'))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
@@ -29,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 2,
"id": "00783250",
"metadata": {},
"outputs": [],
@@ -55,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 3,
"id": "9c95a6f2",
"metadata": {},
"outputs": [
Expand All @@ -65,8 +66,8 @@
"text": [
"Shape of the dataset is (50000, 2)\n",
"Target distribution is \n",
"positive 0.5\n",
"negative 0.5\n",
"positive 0.5\n",
"Name: sentiment, dtype: float64\n",
"Final data shape is (25000, 2)\n",
"Final target distribution is \n",
@@ -82,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 4,
"id": "693f6b13",
"metadata": {},
"outputs": [],
@@ -96,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 5,
"id": "f3a94bcd",
"metadata": {},
"outputs": [],
Expand All @@ -106,7 +107,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 6,
"id": "329ae9b2",
"metadata": {},
"outputs": [
Expand All @@ -129,7 +130,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 19,
"id": "a9a1dd1d",
"metadata": {},
"outputs": [],
Expand All @@ -138,8 +139,8 @@
" \n",
" print('Running TFIDF')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -162,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 20,
"id": "507a98ca",
"metadata": {},
"outputs": [
Expand All @@ -184,7 +185,7 @@
" precision recall f1-score support\n",
"\n",
" 0 0.93 0.99 0.96 5750\n",
" 1 0.75 0.19 0.31 500\n",
" 1 0.74 0.19 0.30 500\n",
"\n",
" accuracy 0.93 6250\n",
" macro avg 0.84 0.59 0.63 6250\n",
@@ -208,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 13,
"id": "f4ea7bd8",
"metadata": {},
"outputs": [
Expand All @@ -231,15 +232,15 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"id": "66f9c177",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['blood', 'direction', 'starts', 'annoying', 'stuff', 'god', 'car', 'came', 'episode', 'lost', 'wouldn', 'picture', 'totally', 'sound', 'cheap', 'care', 'save', 'quality', 'stars', 'hell', 'certainly', 'thinking', 'beginning', 'face', 'boy', 'yes', 'flick', 'kill', 'nice', 'truly', 'stop', 'killed', 'hope', 'written', 'attempt', 'moments', 'children', 'don', 'having', 'lead', 'person', 'felt', 'called', 'overall', 'wanted', 'white', 'writing', 'finally', 'evil', 'entire', 'obviously', 'does', 'girls', 'happens', 'turn', 'run', 'just', 'did', 'act', 'lack', 'looked', 'small', 'ridiculous', 'doing', 'gave', 'gore', 'title', 'game', 'example', 'hour', 'ok', 'case', 'playing', 'tries', 'recommend', 'decent', 'style']\n"
"['absolutely', 'annoying', 'blood', 'totally', 'run', 'lost', 'hour', 'stuff', 'case', 'simply', 'save', 'waste', 'start', 'does', 'sort', 'starts', 'cheap', 'turn', 'mean', 'game', 'given', 'killed', 'looked', 'attempt', 'getting', 'oh', 'did', 'piece', 'happens', 'understand', 'quality', 'sex', 'small', 'don', 'lack', 'person', 'laugh', 'obviously', 'problem', 'kill', 'god', 'came', 'gore', 'killer', 'flick', 'stop', 'worse', 'having', 'wrong', 'went', 'just', 'girls', 'horrible', 'unfortunately', 'felt', 'decent', 'guess', 'called', 'car', 'tries', 'ok', 'supposed', 'try', 'finally', 'yes', 'use', 'guys', 'face', 'wouldn', 'gave', 'couple', 'production', 'example', 'thinking', 'writing', 'rest', 'children', 'low', 'budget', 'video', 'entire', 'lead', 'dialogue', 'hell', 'sound', 'head', 'couldn', 'wanted', 'direction', 'camera', 'ridiculous', 'doing', 'beginning', 'dead', 'crap', 'care', 'lines', 'act']\n"
]
}
],
@@ -251,15 +252,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"id": "11af923d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['first', 'though', 'get', 'since', 'show', 'take', 'enough', 'although', 'another', 'seem', 'however', 'excellent', 'made', 'done', 'find', 'even', 'loved', 'definitely', 'back', 'go', 'still', 'anything', 'top', 'someone', 'rather', 'perfect', 'one', 'might', 'else', 'see', 'could', 'human', 'must', 'today', 'also', 'well', 'many', 'performances', 'everything', 'two', 'nothing', 'without', 'ever', 'seems', 'part', 'seemed', 'wonderful', 'every', 'less', 'yet', 'amazing', 'us', 'put', 'together', 'anyone', 'either', 'always', 'along', 're', 'become', 'would', 'gives', 'something', 'never', 'almost', 'may', 'give', 'found', 'around', 'next', 'much', 'name', 'everyone', 'last', 'least', 'three', 'whole']\n"
"['take', 'give', 'heart', 'name', 'also', 'first', 'never', 'might', 'mother', 'matter', 'several', 'highly', 'perhaps', 'anyone', 'able', 'last', 'drama', 'rather', 'two', 'even', 'nothing', 'another', 'seems', 'see', 'three', 'much', 'top', 'since', 'put', 'part', 'find', 'less', 'cinema', 'one', 'almost', 'ever', 'wants', 'enough', 'us', 're', 'would', 'son', 'human', 'performances', 'together', 'something', 'many', 'however', 'yet', 'definitely', 'gives', 'others', 'always', 'every', 'live', 'around', 'go', 'least', 'get', 'though', 'without', 'wonderful', 'anything', 'keep', 'although', 'favorite', 'still', 'lives', 'show', 'found', 'full', 'loved', 'well', 'experience', 'today', 'made', 'excellent', 'back', 'brilliant', 'liked', 'must', 'entertaining', 'amazing', 'everything', 'become', 'next', 'seem', 'enjoyed', 'someone', 'everyone', 'along', 'perfect', 'art', 'done', 'often', 'may', 'could', 'whole']\n"
]
}
],
8 changes: 4 additions & 4 deletions demos/main_notebook.ipynb
@@ -359,8 +359,8 @@
" \n",
" print('Running base version')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -471,7 +471,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['totally', 'breakfast', 'does', 'guy', 'mileycyrus', 'doing', 'years', 'cause', 'stop', 'hour', 'shopping', 'just', 'pics', '100', 'dad', 'loved', 'rest', 'wonderful', 'kids', 'fm', 'seen', 'plurk', 'till', 'beach', 'end', 'outside', 'probably', 'tho', 'talking', 'says', 'lmao', 'send', 'busy', 'idea', 'saturday', 'kind', 'tv', 'came', 'having', 'super', 'rock', 'believe', 'true', 'stay', 'don', 'remember', 'room', 'enjoying', 'did', 'lots', 'crazy', 'buy', 'girls', 'taking', 'meet', 'mind', 'quite', 'hopefully', 'weeks', 'using', 'fine', 'ill', 'ah', 'watched']\n"
"['room', 'fine', 'guy', 'came', 'send', 'end', 'outside', 'doing', 'beach', 'ah', 'shopping', 'says', 'true', 'breakfast', 'kind', 'hopefully', 'till', 'ill', 'weeks', 'don', 'hour', 'watched', 'tho', 'busy', 'years', 'mind', 'just', 'stop', 'stay', 'pics', '100', 'kids', 'did', 'quite', 'super', 'loved', 'using', 'seen', 'crazy', 'cause', 'meet', 'fm', 'saturday', 'taking', 'tv', 'does', 'believe', 'rest', 'idea', 'probably', 'enjoying', 'having', 'dad', 'lmao', 'mileycyrus', 'talking', 'buy', 'totally', 'remember', 'rock', 'wonderful', 'plurk', 'girls', 'lots']\n"
]
}
],
@@ -491,7 +491,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['hurts', 'first', 'name', 'get', 'still', 'give', 'ever', 'poor', 'since', 'well', 'might', 'almost', 'ugh', 'wanted', 'someone', 'go', 'also', 'much', 'least', 'show', 'around', 'see', 'something', 'sucks', 'always', 'may', 'cant', 'must', 'yet', 'never', 'would', 'keep', 'two', 'iphone', 'cold', 'back', 'one', 'missing', 'stupid', 'put', 'made', 'find', 'us', 'take', 'everything', 'next', 'headache', 'even', 'please', 'already', 'found', 'call', 'another', 'many', 'though', 'last', 're', 'shit', 'done', 'gone', 'didnt', 'everyone', 'nothing', 'could']\n"
"['also', 'go', 'found', 'gone', 'last', 'might', 'someone', 'missing', 'shit', 'hurts', 'though', 'something', 'always', 'well', 'least', 'one', 'please', 'cant', 'already', 'find', 'must', 'would', 'cold', 'wanted', 'many', 'back', 'around', 'call', 'stupid', 'show', 'even', 'ever', 're', 'could', 'ugh', 'first', 'give', 'next', 'almost', 'everything', 'everyone', 'another', 'made', 'yet', 'didnt', 'two', 'name', 'us', 'never', 'keep', 'much', 'done', 'see', 'still', 'get', 'headache', 'take', 'poor', 'put', 'may', 'since', 'iphone', 'nothing', 'sucks']\n"
]
}
],
10 changes: 5 additions & 5 deletions demos/main_notebook_imdb.ipynb
@@ -137,8 +137,8 @@
{
"data": {
"text/plain": [
"negative 0.5\n",
"positive 0.5\n",
"negative 0.5\n",
"Name: sentiment, dtype: float64"
]
},
@@ -297,8 +297,8 @@
" \n",
" print('Running base version')\n",
" tfidf = TfidfVectorizer(max_features=max_feat,stop_words='english')\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.vocabulary_)\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.vocabulary_)\n",
" train_df = pd.DataFrame(tfidf.fit_transform(xtrain).toarray(),columns=tfidf.get_feature_names_out())\n",
" test_df = pd.DataFrame(tfidf.transform(xtest).toarray(),columns=tfidf.get_feature_names_out())\n",
" \n",
" \n",
" model.fit(train_df,ytrain)\n",
@@ -409,7 +409,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['hour', 'obviously', 'looked', 'moments', 'style', 'overall', 'playing', 'did', 'yes', 'care', 'tries', 'gore', 'act', 'doing', 'felt', 'entire', 'run', 'god', 'called', 'don', 'girls', 'person', 'came', 'writing', 'sound', 'gave', 'quality', 'hell', 'car', 'children', 'stuff', 'ridiculous', 'kill', 'flick', 'cheap', 'stop', 'blood', 'starts', 'happens', 'face', 'thinking', 'totally', 'attempt', 'annoying', 'case', 'truly', 'nice', 'beginning', 'small', 'picture', 'stars', 'certainly', 'direction', 'written', 'episode', 'does', 'boy', 'wouldn', 'recommend', 'just', 'wanted', 'hope', 'having', 'turn', 'killed', 'ok', 'lead', 'decent', 'game', 'lost', 'example', 'save', 'finally', 'evil', 'title', 'white', 'lack']\n"
"['felt', 'totally', 'blood', 'flick', 'yes', 'boy', 'don', 'moments', 'recommend', 'came', 'wouldn', 'tries', 'stop', 'beginning', 'title', 'gore', 'white', 'thinking', 'girls', 'car', 'finally', 'style', 'certainly', 'written', 'annoying', 'episode', 'did', 'lead', 'ridiculous', 'stars', 'decent', 'person', 'case', 'hour', 'quality', 'small', 'children', 'looked', 'face', 'care', 'killed', 'gave', 'lack', 'kill', 'ok', 'doing', 'overall', 'entire', 'called', 'happens', 'attempt', 'does', 'save', 'playing', 'cheap', 'direction', 'wanted', 'sound', 'nice', 'truly', 'evil', 'picture', 'example', 'god', 'stuff', 'hope', 'act', 'turn', 'run', 'hell', 'game', 'starts', 'having', 'lost', 'just', 'writing', 'obviously']\n"
]
}
],
@@ -429,7 +429,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['get', 'although', 'every', 'perfect', 'gives', 'else', 'part', 'excellent', 'also', 'would', 'one', 'seemed', 'along', 'less', 'see', 'either', 'name', 'done', 'find', 'another', 'something', 'least', 'nothing', 'almost', 'wonderful', 'loved', 'rather', 'since', 'seem', 'could', 'become', 'back', 'however', 'two', 'much', 'seems', 'last', 'definitely', 'made', 'top', 'performances', 'may', 'someone', 'ever', 'give', 'human', 're', 'never', 'everyone', 'still', 'found', 'us', 'might', 'show', 'go', 'without', 'around', 'today', 'even', 'anyone', 'three', 'everything', 'next', 'must', 'though', 'anything', 'together', 'first', 'put', 'amazing', 'many', 'always', 'yet', 'whole', 'well', 'enough', 'take']\n"
"['even', 'definitely', 'since', 'although', 'top', 'also', 'rather', 'always', 'three', 'perfect', 'seemed', 'almost', 'next', 'back', 'us', 'without', 'ever', 'seems', 'never', 'would', 'something', 'every', 'everything', 'someone', 'get', 'around', 'human', 'nothing', 'enough', 'loved', 'whole', 'everyone', 'amazing', 'name', 'still', 'find', 're', 'see', 'well', 'much', 'last', 'another', 'least', 'anything', 'seem', 'however', 'show', 'many', 'found', 'give', 'one', 'take', 'yet', 'become', 'though', 'anyone', 'might', 'excellent', 'less', 'performances', 'could', 'made', 'first', 'wonderful', 'part', 'go', 'today', 'together', 'else', 'two', 'gives', 'must', 'along', 'may', 'done', 'either', 'put']\n"
]
}
],
(diffs for the remaining 4 changed files not shown)
