normalizePosts.py
import re
import pandas as pd
import nltk
import spacy
import inflect
from contractions import CONTRACTION_MAP
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
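# NOTE: the NLTK stopwords and wordnet corpora must be available locally,
# e.g. via nltk.download('stopwords') and nltk.download('wordnet')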
# the following words might be useful to keep for the purpose of sentiment classification
# i.e. 'no' or 'not' could indicate negative sentiment
words_list_keep = ["no", "not", "shouldn't", "couldn't", "aren't", "didn't"]

def change_stopwords(words_list=words_list_keep):
    # drop the negation words above from the stopword list so they survive filtering
    for word in words_list:
        try:
            stopword_list.remove(word)
            # print(word, 'removed from stopwords list')
        except ValueError:
            continue

change_stopwords(words_list_keep)
# import training data
input_file = './data/raw_df.csv'
output_file = './data/normalized_df.csv'
train_df = pd.read_csv(input_file, encoding='utf-8')
train_df = train_df.fillna('')
train_df['shared_text'] = train_df['shared_text'].apply(lambda x: x.split('\n', 1)[-1])
# other functionalities:
# n-gram tagging so that proper nouns stay together
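# A minimal sketch of that n-gram idea (an assumption, not wired into this script):
# join multi-word named entities with underscores so proper nouns stay together
# when the text is later split on whitespace; assumes en_core_web_sm is installed.
def keep_proper_nouns_together(text, nlp=None):
    nlp = nlp or spacy.load('en_core_web_sm')
    doc = nlp(text)
    for ent in doc.ents:
        if ' ' in ent.text:
            text = text.replace(ent.text, ent.text.replace(' ', '_'))
    return text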
# this function removes various types of left/right quote marks
# and also replaces single left/right quote marks with regular ones
def fix_quotes(text):
    # strip double and angled quotation marks entirely
    pattern = r'["“”„«»]'
    text = re.sub(pattern, '', text)
    # replace curly single quote marks with a plain apostrophe
    pattern = r'[‘‛’]'
    text = re.sub(pattern, '\'', text)
    # replace dashes with spaces so that strong-willed --> strongwilled doesn't happen
    pattern = r'[-—]'
    text = re.sub(pattern, ' ', text)
    return text
# NOTE: this does not work for contractions inside quotations
def expand_contractions(words, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        # preserve the casing of the first character of the original match
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, words)
    # drop any remaining apostrophes (e.g. possessives)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
# perhaps we could count exclamation marks '!' separately (see the sketch after this function)
# gets rid of punctuation (i.e. '.', '!', '?')
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text
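# A minimal sketch of that exclamation-count idea (an assumption, not wired into the
# pipeline below): run it on the raw text before remove_special_characters strips '!',
# e.g. train_df['exclam_count'] = train_df['text'].apply(count_exclamations)
def count_exclamations(text):
    # number of '!' characters in the post text
    return text.count('!')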
# removes common filler words of little use (e.g. 'and', 'the'...)
def remove_stopwords(words, is_lower_case=False):
    tokens = tokenizer.tokenize(words)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_words = ' '.join(filtered_tokens)
    return filtered_words
# we use the inflect library to replace numbers with textual representations
# (way easier than my hand-made dictionary!)
def replace_numbers(words):
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words
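# Usage sketch (an assumption; replace_numbers is not called in the pipeline below):
# it expects a list of tokens, e.g.
# ' '.join(replace_numbers('I bought 2 apples'.split()))  ->  'I bought two apples'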
# needed to lemmatize the first element of the word tuple (string, POS)
# may want to adjust lemmatization depending on POS (noun, verb, adjective)
def lemmatize(words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word in words:
        lemma = lemmatizer.lemmatize(word[0])
        lemmatized_words.append(lemma)
    return lemmatized_words
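# A minimal sketch of POS-aware lemmatization (an assumption, not used below):
# map Penn Treebank tags from pos_tag to WordNet POS codes so that, for example,
# ('running', 'VBG') lemmatizes to 'run' rather than 'running'.
def lemmatize_with_pos(tagged_words):
    from nltk.corpus import wordnet
    lemmatizer = WordNetLemmatizer()
    tag_map = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}
    lemmas = []
    for word, tag in tagged_words:
        lemmas.append(lemmatizer.lemmatize(word, pos=tag_map.get(tag[0], wordnet.NOUN)))
    return lemmas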
## normalizing steps:
# combine post text and shared text fields
train_df.insert(0, 'text', train_df['post_text'] + ' ' + train_df['shared_text'])
train_df = train_df.drop(columns = ['post_text', 'shared_text'])
# drop rows where the number of shares is 0: this is a bug in facebook_scraper
train_df = train_df[train_df['shares'] != 0]
train_df = train_df.reset_index(drop = True)
# dummify image field: 1 if there was an image, 0 if there wasn't
train_df['image'] = train_df['image'].apply(lambda x: 0 if x == '' else 1)
# print('raw expression', train_df.text[139])
# normalize text: remove quotes THEN expand contractions and remove special characters
# (stopword removal, POS tagging and lemmatization are currently disabled below)
print('Normalizing Facebook Post text...')
print('Removing quotation symbols...')
train_df['text'] = train_df['text'].apply(fix_quotes)
# print('after removing quotation symbols...')
# test = train_df.text[139]
print('Expanding contractions...')
train_df['text'] = train_df['text'].apply(expand_contractions)
# print('after expanding contractions...')
print('Removing special characters...')
train_df['text'] = train_df['text'].apply(remove_special_characters)
# print('removing special characters...')
## reposts with duplicate text may need to be removed: take the average of likes, comments, shares
# train_df['text'] = train_df['text'].apply(lambda x: remove_stopwords(x))
# print('after removing stop words...', train_df.text[139])
# tokenize by splitting each sentence before lemmatizing
# train_df['text'] = train_df['text'].apply(lambda x: pos_tag(x.split()))
# print('tokenizing sentences and tagging...', train_df.text[10])
# train_df['text'] = train_df['text'].apply(lambda x: lemmatize(x))
# print('after lemmatizing...', train_df.text[139])
train_df.to_csv(output_file, index = False, encoding='utf-8')