-
Notifications
You must be signed in to change notification settings - Fork 0
/
lemmatization.py
41 lines (33 loc) · 1.27 KB
/
lemmatization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources required for tokenization (punkt) and
# WordNet-based lemmatization (wordnet). No-op if already downloaded.
nltk.download('wordnet')
nltk.download('punkt')

# Read in the cleaned corpus produced by the earlier cleaning step.
df_selected = pd.read_pickle('corpus_balanced3_cleaned.pkl')

lemmatizer = WordNetLemmatizer()
NARRATIVE_COL = "Consumer complaint narrative"


def _lemmatize_text(text):
    """Tokenize *text*, lemmatize each token, and rejoin with single spaces."""
    return ' '.join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text))


# Lemmatize every complaint narrative in one vectorized pass.
# NOTE: the original wrote back via chained indexing
# (df[col][ind] = s), which raises SettingWithCopyWarning and can
# silently fail to modify the underlying frame; assigning the mapped
# Series to the column is the supported way to update it.
df_selected[NARRATIVE_COL] = df_selected[NARRATIVE_COL].map(_lemmatize_text)

print("nulls in df_selected:", df_selected[NARRATIVE_COL].isnull().sum())

# If for some reason some complaints are null, remove them — there
# aren't very many of them anyway.
# BUG FIX: previously dropna()'s result was assigned to an unused
# variable (df_bcl) while the un-dropped frame was saved, so null rows
# leaked into the CSV. Rebind df_selected so the dropped frame is saved.
df_selected = df_selected.dropna()

# Persist for the next pipeline stage.
df_selected.to_csv("corpus_balanced3_cleaned_lemmatized.csv", index=False)