import re
import string
from collections import Counter
from functools import partial
from time import time

import matplotlib.pyplot as plt  # matplotlib backs nltk's FreqDist.plot() in the driver code below
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')                       # tokenizer models for nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger used by nltk.pos_tag
nltk.download('wordnet')                     # WordNet (lemmatizer, antonyms, elongated-word lookups)
nltk.download('stopwords')                   # English stop-word list used below
nltk.download('sentiwordnet')                # SentiWordNet scores used by get_sentiment
""" Pre Processing Tweeets """
def removeUnicode(text):
    r""" Removes unicode escape strings like "\u002c" and non-ASCII bytes like "\x96" """
    text = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', text)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    return text
def replaceURL(text):
    """ Removes URL addresses from the text """
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    return text
def replaceAtUser(text):
    """ Removes the leading retweet marker "RT" and "@user" mentions """
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r"@[^\s]+", "", text)
    return text
def removeHashtagInFrontOfWord(text):
    """ Removes the hashtag in front of a word """
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text
def removeNumbers(text):
    """ Removes digits from the text """
    text = ''.join([i for i in text if not i.isdigit()])
    return text

def replaceMultiExclamationMark(text):
    """ Collapses repetitions of exclamation marks into one """
    text = re.sub(r"(\!)\1+", '!', text)
    return text

def replaceMultiQuestionMark(text):
    """ Collapses repetitions of question marks into one """
    text = re.sub(r"(\?)\1+", '?', text)
    return text

def replaceMultiStopMark(text):
    """ Collapses repetitions of stop marks into one """
    text = re.sub(r"(\.)\1+", '.', text)
    return text

def countMultiExclamationMarks(text):
    """ Counts repetitions of exclamation marks """
    return len(re.findall(r"(\!)\1+", text))

def countMultiQuestionMarks(text):
    """ Counts repetitions of question marks """
    return len(re.findall(r"(\?)\1+", text))

def countMultiStopMarks(text):
    """ Counts repetitions of stop marks """
    return len(re.findall(r"(\.)\1+", text))
def countElongated(text):
    """ Input: a text, Output: how many words are elongated """
    regex = re.compile(r"(.)\1{2}")
    return len([word for word in text.split() if regex.search(word)])

def countAllCaps(text):
    """ Input: a text, Output: how many words are all caps """
    return len(re.findall("[A-Z0-9]{3,}", text))
""" Creates a dictionary with slangs and their equivalents and replaces them """
with open('slang.txt') as file:
slang_map = dict(map(str.strip, line.partition('\t')[::2])
for line in file if line.strip())
slang_words = sorted(slang_map, key=len, reverse=True) # longest first for regex
regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])
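
# A minimal usage sketch (hypothetical: assumes slang.txt contains a line "gr8<TAB>great"):
#   replaceSlang("that movie was gr8")  ->  "that movie was great"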
def countSlang(text):
    """ Input: a text, Output: how many slang words and a list of found slangs """
    slangCounter = 0
    slangsFound = []
    tokens = nltk.word_tokenize(text)
    for word in tokens:
        if word in slang_map:  # dict lookup is O(1); the keys are the same as slang_words
            slangsFound.append(word)
            slangCounter += 1
    return slangCounter, slangsFound
""" Replaces contractions from a string to their equivalents """
contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'),
(r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s','\g<1> is'), (r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would'),
(r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'),
(r'wont', 'will not') ]
def replaceContraction(text):
patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
for (pattern, repl) in patterns:
(text, count) = re.subn(pattern, repl, text)
return text
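
# A quick sketch of the rules above:
#   replaceContraction("I can't believe it won't work")  ->  "I cannot believe it will not work"
# Note the patterns are lower-case and case-sensitive, so "Can't" is only caught
# by the generic (\w+)n't rule, which yields "Ca not".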
def replaceElongated(word):
    """ Replaces an elongated word with its basic form, unless the word exists in the lexicon """
    repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    repl = r'\1\2\3'
    if wordnet.synsets(word):
        return word
    repl_word = repeat_regexp.sub(repl, word)
    if repl_word != word:
        return replaceElongated(repl_word)
    else:
        return repl_word
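
# Example of the recursion above (assumes WordNet knows "good" but not "goood"):
#   replaceElongated("goood")  ->  "good"
# One duplicated letter is stripped per pass until a WordNet entry is reached
# or no repeated letters remain.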
# Shared emoticon pattern. Two fixes over the original: the kiss emoticons use an
# escaped `\*` (the unescaped `*` quantifier made a bare ":" match on its own),
# and the confused-face emoticons are the literal ":S"/":-S" rather than ":\S".
EMOTICON_PATTERN = r':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-\*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:S|T\.T|\.\_\.|:<|:-S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:'

def removeEmoticons(text):
    """ Removes emoticons from text """
    return re.sub(EMOTICON_PATTERN, '', text)

def countEmoticons(text):
    """ Input: a text, Output: how many emoticons """
    return len(re.findall(EMOTICON_PATTERN, text))
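
# Sketch: both helpers share EMOTICON_PATTERN, e.g.
#   countEmoticons("nice :) but also :-(")   ->  2
#   removeEmoticons("nice :) but also :-(")  ->  "nice  but also "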
### Spell Correction begin ###
""" Spell Correction http://norvig.com/spell-correct.html """

def words(text):
    """ Write words all in lower case """
    return re.findall(r'\w+', text.lower())

with open('corporaForSpellCorrection.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """ Probability of `word`. """
    return WORDS[word] / N

def spellCorrection(word):
    """ Most probable spelling correction for word. """
    return max(candidates(word), key=P)

def candidates(word):
    """ Generate possible spelling corrections for word. """
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words):
    """ The subset of `words` that appear in the dictionary of WORDS. """
    return set(w for w in words if w in WORDS)

def edits1(word):
    """ All edits that are one edit away from `word`. """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """ All edits that are two edits away from `word`. """
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
### Spell Correction End ###
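
# A hedged sketch; the output depends entirely on corporaForSpellCorrection.txt:
#   spellCorrection("speling")  ->  "spelling", assuming "spelling" is the most
#   frequent known candidate within two edits of the input.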
### Replace Negations Begin ###
def replace(word, pos=None):
    """ Creates a set of all antonyms for the word and if there is only one antonym, it returns it """
    antonyms = set()
    for syn in wordnet.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms.add(antonym.name())
    if len(antonyms) == 1:
        return antonyms.pop()
    else:
        return None

def replaceNegations(text):
    """ Finds "not" and antonym for the next word and if found, replaces not and the next word with the antonym """
    i, l = 0, len(text)
    words = []
    while i < l:
        word = text[i]
        if word == 'not' and i + 1 < l:
            ant = replace(text[i + 1])
            if ant:
                words.append(ant)
                i += 2
                continue
        words.append(word)
        i += 1
    return words
### Replace Negations End ###
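
# Sketch of the negation replacement on a token list (WordNet-dependent):
#   replaceNegations(["this", "is", "not", "able"])  ->  ["this", "is", "unable"]
# replace() only fires when the next word has exactly one antonym in WordNet;
# otherwise both tokens are kept unchanged.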
def addNotTag(text):
    """ Finds "not,never,no" and adds the tag NEG_ to all words that follow until the next punctuation """
    transformed = re.sub(r'\b(?:not|never|no)\b[\w\s]+[^\w\s]',
                         lambda match: re.sub(r'(\s+)(\w+)', r'\1NEG_\2', match.group(0)),
                         text,
                         flags=re.IGNORECASE)
    return transformed
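
# Sketch of the NEG_ tagging (the match runs up to the next punctuation mark):
#   addNotTag("I do not like rainy days. Really.")
#     ->  "I do not NEG_like NEG_rainy NEG_days. Really."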
def addCapTag(word):
    """ Finds a word with at least 3 characters capitalized and adds the tag ALL_CAPS_ """
    if len(re.findall("[A-Z]{3,}", word)):
        word = word.replace('\\', '')
        # Prefix the capitalized run itself; the backreference avoids duplicating the word
        transformed = re.sub(r"([A-Z]{3,})", r"ALL_CAPS_\1", word)
        return transformed
    else:
        return word
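
# Sketch: the backreference keeps the capitalized run intact, e.g.
#   addCapTag("GREAT")  ->  "ALL_CAPS_GREAT"
#   addCapTag("ok")     ->  "ok"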
print("Starting preprocess..\n")
""" Tokenizes a text to its words, removes and replaces some of them """
finalTokens = [] # all tokens
stoplist = stopwords.words('english')
my_stopwords = "multiexclamation multiquestion multistop url atuser st rd nd th am pm" # my extra stopwords
stoplist = stoplist + my_stopwords.split()
allowedWordTypes = ["J","R","V","N"] # J is Adject, R is Adverb, V is Verb, N is Noun. These are used for POS Tagging
lemmatizer = WordNetLemmatizer() # set lemmatizer
stemmer = PorterStemmer() # set stemmer
def tokenize(text, wordCountBefore, textID, y):
    # wordCountBefore is currently unused; it is kept so the caller's signature still matches
    onlyOneSentenceTokens = []  # tokens of one sentence each time
    tokens = nltk.word_tokenize(text)
    tokens = replaceNegations(tokens)  # Technique 6: finds "not" and, if the next word has a single antonym, replaces both with it
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)  # Technique 7: remove punctuation (note: the tokens above are not re-tokenized)
    #tokens = nltk.word_tokenize(text)  # it takes a text as an input and provides a list of every token in it

    ### POS TAGGING BEGIN ###
    tagged = nltk.pos_tag(tokens)  # Technique 13: part-of-speech tagging
    for w in tagged:
        if w[1][0] in allowedWordTypes and w[0] not in stoplist:
            final_word = addCapTag(w[0])
            final_word = final_word.lower()
            final_word = replaceElongated(final_word)
            if len(final_word) > 1:
                final_word = spellCorrection(final_word)
            final_word = lemmatizer.lemmatize(final_word)
            final_word = stemmer.stem(final_word)
            onlyOneSentenceTokens.append(final_word)
            finalTokens.append(final_word)
    ### POS TAGGING END ###

    onlyOneSentence = " ".join(onlyOneSentenceTokens)  # form the sentence again from the list of tokens
    twt = onlyOneSentence

    """ Write the preprocessed text to file """
    with open("result.txt", "a") as result:
        result.write(textID + "\t" + y + "\t" + onlyOneSentence + "\n")

    return finalTokens, twt
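
# A hedged usage sketch (appends one line to result.txt as a side effect;
# the ID and label below are hypothetical):
#   allTokens, sentence = tokenize("This is not goood !!!", 5, "id123", "positive")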
def penn_to_wn(tag):
    """ Maps a Penn Treebank POS tag to the corresponding WordNet POS constant """
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('V'):
        return wordnet.VERB
    return None
# Returns a list [synset name, positive, negative, objective score].
# Returns an empty list if the word is not present in SentiWordNet.
def get_sentiment(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wordnet.NOUN, wordnet.ADJ, wordnet.ADV):
        return []
    # Lemmatization
    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []
    # A Synset is a simple interface in NLTK to look up words in WordNet.
    # Synset instances are groupings of synonymous words that express the same concept.
    # Some words have only one Synset and some have several.
    synsets = wordnet.synsets(word, pos=wn_tag)
    if not synsets:
        return []
    # Take the first sense, the most common one
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())
    return [synset.name(), swn_synset.pos_score(), swn_synset.neg_score(), swn_synset.obj_score()]
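
# Sketch (exact scores depend on the installed SentiWordNet data):
#   get_sentiment("happy", "JJ")  ->  ["happy.a.01", <pos>, <neg>, <obj>]
#   get_sentiment("run", "VB")    ->  []   (verbs are filtered out above)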
pos=neg=obj=count=0
###################################################################################
"""
f = open("ss-twitterfinal.txt", "r", encoding="utf8", errors='replace').read()
t0 = time()

totalSentences = 0
totalEmoticons = 0
totalSlangs = 0
totalSlangsFound = []
totalElongated = 0
totalMultiExclamationMarks = 0
totalMultiQuestionMarks = 0
totalMultiStopMarks = 0
totalAllCaps = 0

for line in f.split('\n'):
    if not line.strip():
        continue  # guard against the trailing empty line produced by split('\n')
    totalSentences += 1
    columns = line.split('\t')
    columns = [col.strip() for col in columns]
    textID = columns[0]
    y = columns[2]
    text = removeUnicode(columns[1])  # Technique 0
    #print(text)  # print initial text
    wordCountBefore = len(re.findall(r'\w+', text))  # word count of one sentence before preprocess
    #print("Words before preprocess: ", wordCountBefore, "\n")
    text = replaceURL(text)  # Technique 1
    text = replaceAtUser(text)  # Technique 1
    text = removeHashtagInFrontOfWord(text)  # Technique 1
    temp_slangs, temp_slangsFound = countSlang(text)
    totalSlangs += temp_slangs  # total slangs for all sentences
    for word in temp_slangsFound:
        totalSlangsFound.append(word)  # all the slangs found in all sentences
    text = replaceSlang(text)  # Technique 2: replaces slang words and abbreviations with their equivalents
    text = replaceContraction(text)  # Technique 3: replaces contractions with their equivalents
    text = removeNumbers(text)  # Technique 4: removes digits from text
    emoticons = countEmoticons(text)  # how many emoticons in this sentence
    totalEmoticons += emoticons
    text = removeEmoticons(text)  # removes emoticons from text
    totalAllCaps += countAllCaps(text)
    totalMultiExclamationMarks += countMultiExclamationMarks(text)  # repetitions of exclamation marks in this sentence
    totalMultiQuestionMarks += countMultiQuestionMarks(text)  # repetitions of question marks in this sentence
    totalMultiStopMarks += countMultiStopMarks(text)  # repetitions of stop marks in this sentence
    text = replaceMultiExclamationMark(text)  # Technique 5: collapses repeated exclamation marks
    text = replaceMultiQuestionMark(text)  # Technique 5: collapses repeated question marks
    text = replaceMultiStopMark(text)  # Technique 5: collapses repeated stop marks
    totalElongated += countElongated(text)  # how many elongated words in this sentence
    tokens, twt = tokenize(text, wordCountBefore, textID, y)  # unpack the (all tokens so far, this sentence) pair

print("Total sentences: ", totalSentences, "\n")
print("Total Words before preprocess: ", len(re.findall(r'\w+', f)))
print("Total Distinct Tokens before preprocess: ", len(set(re.findall(r'\w+', f))))
print("Average word/sentence before preprocess: ", len(re.findall(r'\w+', f)) / totalSentences, "\n")
print("Total Words after preprocess: ", len(tokens))
print("Total Distinct Tokens after preprocess: ", len(set(tokens)))
print("Average word/sentence after preprocess: ", len(tokens) / totalSentences, "\n")
print("Total run time: ", time() - t0, " seconds\n")
print("Total emoticons: ", totalEmoticons, "\n")
print("Total slangs: ", totalSlangs, "\n")

commonSlangs = nltk.FreqDist(totalSlangsFound)
for (word, count) in commonSlangs.most_common(20):  # most common slangs across all texts
    print(word, "\t", count)
commonSlangs.plot(20, cumulative=False)  # plot most common slangs

print("Total elongated words: ", totalElongated, "\n")
print("Total multi exclamation marks: ", totalMultiExclamationMarks)
print("Total multi question marks: ", totalMultiQuestionMarks)
print("Total multi stop marks: ", totalMultiStopMarks, "\n")
print("Total all capitalized words: ", totalAllCaps, "\n")

#print(tokens)
commonWords = nltk.FreqDist(tokens)
print("Most common words ")
print("Word\tCount")
for (word, count) in commonWords.most_common(100):  # most common words across all texts
    print(word, "\t", count)
commonWords.plot(100, cumulative=False)  # plot most common words

bgm = nltk.collocations.BigramAssocMeasures()
tgm = nltk.collocations.TrigramAssocMeasures()
bgm_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
tgm_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
bgm_finder.apply_freq_filter(5)  # bigrams that occur at least 5 times
print("Most common collocations (bigrams)")
print(bgm_finder.nbest(bgm.pmi, 50))  # top 50 bigram collocations
tgm_finder.apply_freq_filter(5)  # trigrams that occur at least 5 times
print("Most common collocations (trigrams)")
print(tgm_finder.nbest(tgm.pmi, 20))  # top 20 trigram collocations
"""