extended_df.py
# Data processing
import pandas as pd
from collections import Counter
# Language processing
from nltk.corpus import stopwords
import nltk
# nltk.download('stopwords') # if you haven't downloaded this yet.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
def load_df(filepath='/Users/wolfsinem/product-tagging/data/flipkart_com-ecommerce_sample.csv'):
    """Read the sample CSV file and return it as a pandas DataFrame."""
    return pd.read_csv(filepath)
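
# load_df() accepts any CSV path, so the dataset can live elsewhere, e.g.
# (the path below is just an illustration):
# df = load_df('data/my_products.csv')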
def model_dataframe():
    """Check that the loaded dataframe has both the product_name and
    description columns. If so, build a new model dataframe from those two
    columns with an empty tags column added; otherwise raise a ValueError.
    """
    df = load_df()
    df.dropna(inplace=True)
    df.drop_duplicates(subset=['description'], inplace=True)
    if 'product_name' in df.columns and 'description' in df.columns:
        # .copy() avoids pandas' SettingWithCopyWarning when adding 'tags'
        model_df = df[['product_name', 'description']].copy()
        # dropna/drop_duplicates leave gaps in the index; reset it so the
        # positional assignment in extend_tokenized_model lines up
        model_df.reset_index(drop=True, inplace=True)
        model_df['tags'] = ""
    else:
        raise ValueError("Columns product_name and description don't exist")
    return model_df
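
# Quick sanity check (a sketch; assumes the sample CSV above is present):
# model_df = model_dataframe()
# list(model_df.columns)  # -> ['product_name', 'description', 'tags']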
def tokenize_model(sentence):
    """Tokenize a single product description and return its ten most common
    terms, with stop words and a few manually chosen domain words removed.

    :param sentence: a product description from the model dataframe.
    :type sentence: str.
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    new_words = tokenizer.tokenize(sentence)
    new_words = [token.lower() for token in new_words]  # lower-case all tokens
    stop_words = set(stopwords.words('english'))
    # these are manually filtered words, not the most efficient way
    manually_filtered = ['product', 'type', 'fabric', 'material', 'warranty',
                         'key', 'details', 'use', 'avoid']
    filtered_sentence = [w for w in new_words if w not in stop_words
                         and w not in manually_filtered]
    count_terms = Counter(filtered_sentence).most_common(10)
    # keep only the term from each (term, count) pair
    return [item[0] for item in count_terms]
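
# Example usage (a sketch with a made-up description; stop words such as
# 'this' and 'a', and filtered words such as 'fabric', are dropped):
# tokenize_model("This is a soft cotton shirt with a cotton fabric lining")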
def extend_tokenized_model():
    """Create a set of tags for each product description in the dataframe by
    tokenizing every description and dropping tokens that contain digits.
    """
    model_df = model_dataframe()
    token_lists = []
    for description in model_df['description']:
        token_lists.append(
            [x for x in tokenize_model(str(description))
             if not any(c.isdigit() for c in x)])
    for i in range(len(model_df.index)):
        model_df.at[i, 'tags'] = token_lists[i]
    return model_df


print(extend_tokenized_model().head())
# this returns the model dataframe with an extra 'tags' column holding the
# generated tags for each product
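
# To persist the result, you could write it back to disk (the output
# filename here is just an illustration):
# extend_tokenized_model().to_csv('tagged_products.csv', index=False)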