-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessor.py
55 lines (42 loc) · 1.39 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import re
class Preprocessor:
    """Clean up the text stored in the first column of a DataFrame.

    Every value of that column is converted to Unicode text and then
    stripped of space characters, digits and emoji/pictograph symbols
    (see ``cleanup`` for the exact order of the steps).
    """

    # Patterns are compiled once at class-creation time instead of on every
    # call; ``cleanup`` runs once per row, so this keeps the per-row cost low.
    _DIGITS_RE = re.compile(r"\d+")

    try:
        # Wide (UCS-4) builds and every Python 3.3+ interpreter: astral
        # code points can be used directly inside a character class.
        _EMOJI_RE = re.compile(
            u'['
            u'\U0001F300-\U0001F64F'
            u'\U0001F680-\U0001F6FF'
            u'\u2600-\u26FF\u2700-\u27BF]+',
            re.UNICODE)
    except re.error:
        # Narrow (UCS-2) Python 2 builds store astral code points as
        # surrogate pairs, which makes the ranges above invalid; match the
        # surrogate pairs explicitly instead.
        _EMOJI_RE = re.compile(
            u'('
            u'\ud83c[\udf00-\udfff]|'
            u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
            u'[\u2600-\u26FF\u2700-\u27BF])+',
            re.UNICODE)

    def process(self, orig_df, inplace):
        """Apply ``cleanup`` to every value of the first column.

        Args:
            orig_df: DataFrame-like object (pandas-style: it must provide
                ``columns``, ``copy()`` and column access whose result has
                ``apply``). Only the first column is processed.
            inplace: When true, ``orig_df`` itself is modified and nothing
                is returned. When false, ``orig_df`` is left untouched and
                a processed copy is returned.

        Returns:
            The processed copy when ``inplace`` is false, otherwise ``None``.
        """
        df = orig_df if inplace else orig_df.copy()
        column = df.columns[0]  # the only column that gets preprocessed
        df[column] = df[column].apply(self.cleanup)
        return None if inplace else df

    def cleanup(self, s):
        """Return the cleaned-up text form of a single value.

        The steps run in a fixed order: conversion to Unicode text (this
        must come first, the later steps need a string), then removal of
        spaces, digits and emojis.
        """
        s = self.convert_to_UTF(s)  # mandatory: later steps need text
        s = self.remove_spaces(s)
        s = self.remove_digits(s)
        s = self.remove_emojis(s)
        return s

    def convert_to_UTF(self, s):
        """Return ``s`` as Unicode text.

        Text is returned unchanged, byte strings are decoded as UTF-8 and
        any other object is converted with ``str()`` first.

        The previous implementation relied on the Python 2-only ``unicode``
        builtin, which raised ``NameError`` on Python 3, and it also failed
        on Python 2 for non-ASCII text because of an implicit ASCII encode.

        Raises:
            UnicodeDecodeError: If a byte string is not valid UTF-8.
        """
        # ``type(u"")`` is ``unicode`` on Python 2 and ``str`` on Python 3,
        # so the check works without naming the Python 2-only builtin.
        text_type = type(u"")
        if isinstance(s, text_type):
            return s
        if not isinstance(s, bytes):
            # Numbers, None, NaN, ...: use their string representation.
            s = str(s)
        if isinstance(s, bytes):
            # Raw bytes, or the result of ``str()`` on Python 2.
            s = s.decode("utf-8")
        return s

    def remove_spaces(self, s):
        """Return ``s`` without space characters (U+0020 only)."""
        return s.replace(" ", "")

    def remove_digits(self, s):
        """Return ``s`` with every run of digits removed."""
        return self._DIGITS_RE.sub("", s)

    def remove_emojis(self, s):
        """Return ``s`` without emoji and pictograph symbols.

        The characters removed are those in the ranges U+1F300-U+1F64F,
        U+1F680-U+1F6FF, U+2600-U+26FF and U+2700-U+27BF. Other non-letter
        characters (punctuation, other symbols) are kept.
        """
        return self._EMOJI_RE.sub(r'', s)