-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
77 lines (56 loc) · 2.1 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import re
# Symbols that clean_text() isolates with surrounding spaces. Every entry is a
# single character and appears exactly once.
# NOTE(review): ':', ')', '(' and ',' were listed twice in the previous version
# of this list, which padded them twice. Possibly the second copies were meant
# to be full-width variants lost in transcription -- confirm with the author.
_CLEAN_TEXT_PUNCTS = (
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#',
    '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â',
    '█', '½', 'à', '…',
    '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―',
    '¥', '▓', '—', '‹', '─',
    '▒', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸',
    '¾', 'Ã', '⋅', '‘', '∞',
    '∙', '↓', '、', '│', '»', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√',
)

# Translation table (code point -> " <char> ") built once at import time so
# each call is a single pass over the text.
_CLEAN_TEXT_TABLE = {ord(p): f' {p} ' for p in _CLEAN_TEXT_PUNCTS}


def clean_text(x):
    """Return ``str(x)`` with every listed symbol padded by one space per side.

    Args:
        x: Any value; it is converted with ``str()`` before processing.

    Returns:
        The text in which each character from ``_CLEAN_TEXT_PUNCTS`` is
        replaced by ``' <char> '``. All other characters are left unchanged,
        and existing whitespace is not collapsed.
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)
def remove_names(x):
    """Remove whitespace-delimited tokens that start with ``@`` from *x*.

    Only whole tokens are removed: an ``@`` in the middle of a token (for
    example an e-mail address) is kept, and removing a short mention never
    eats the prefix of a longer one. Surrounding whitespace is left as-is.

    Args:
        x: The input string.

    Returns:
        *x* with every token beginning with ``@`` deleted.
    """
    # (?<!\S) requires start-of-string or preceding whitespace, so the match
    # always begins at a token boundary; \S* then consumes the rest of the token.
    return re.sub(r'(?<!\S)@\S*', '', x)
def sep_digits(x):
    """Put a space on each side of every run of digits in *x*.

    The string is split around digit runs (the capturing group keeps the runs
    in the result) and the pieces are re-joined with single spaces. A digit
    run at the very start or end therefore yields a leading or trailing space.

    Args:
        x: The input string.

    Returns:
        The re-joined string, e.g. ``"abc123def"`` -> ``"abc 123 def"``.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return " ".join(re.split(r'(\d+)', x))
def sep_punc(x):
    """Surround each punctuation character of *x* with single spaces.

    Characters found in the local ``punc`` string (ASCII punctuation plus a
    few Arabic and quotation marks) are emitted as ``' <char> '``; everything
    else is copied through unchanged.
    """
    punc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~؛،؟؛.»«”'
    return ''.join(
        ' ' + ch + ' ' if ch in punc else ch
        for ch in x
    )
# Arabic diacritic marks, one character per constant. The literals are
# combining characters, so they may render attached to the neighbouring quote
# in some editors.
# Short vowels and sukun (U+064F, U+0652, U+064E, U+0650).
damma = "ُ"
sukun = "ْ"
fatha = "َ"
kasra = "ِ"
# Gemination mark (U+0651).
shadda = "ّ"
# Tanween (nunation) forms (U+064C, U+064B, U+064D).
tanweendam = "ٌ"
tanweenfath = "ً"
tanweenkasr = "ٍ"
# Tatweel / kashida elongation character (U+0640). It is not a diacritic, but
# it is included in the tuple below and so is stripped by remove_tashkil().
tatweel = "ـ"
# All characters that remove_tashkil() drops from its input.
tashkil = (damma, sukun, fatha, kasra, shadda, tanweendam, tanweenfath, tanweenkasr, tatweel)
def remove_tashkil(word):
    """Return *word* without any character listed in the module-level ``tashkil`` tuple."""
    kept = filter(lambda ch: ch not in tashkil, word)
    return "".join(kept)
def clean_arabic(x):
    """Run the Arabic cleaning pipeline on *x*.

    Steps, in order: ``remove_tashkil``, then ``sep_digits``, then
    ``sep_punc``. The result of the last step is returned.
    """
    without_marks = remove_tashkil(x)
    digits_spaced = sep_digits(without_marks)
    return sep_punc(digits_spaced)
def normalize(some_string):
    """Replace selected Arabic letters with a canonical form, character by character.

    Teh marbuta becomes heh, alef with hamza above/below becomes bare alef,
    and yeh becomes alef maksura. Every other character is kept unchanged.
    """
    replacements = {
        'ة': 'ه',
        'أ': 'ا',
        'إ': 'ا',
        'ي': 'ى',
    }
    return ''.join(map(lambda ch: replacements.get(ch, ch), some_string))