-
Notifications
You must be signed in to change notification settings - Fork 5
/
clean_text.py
85 lines (66 loc) · 2.76 KB
/
clean_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import re
import string
from alphabet_detector import AlphabetDetector
from bs4 import BeautifulSoup
# Arabic diacritics (harakat) plus the tatweel elongation character.
# The characters are spelled as explicit code points inside a character class
# so the pattern does not depend on invisible combining marks surviving
# editors, copy/paste, or Unicode normalisation of this source file.
arabic_diacritics = re.compile(
    "["
    "\u0651"  # Shadda (Tashdid)
    "\u064E"  # Fatha
    "\u064B"  # Tanwin Fath (Fathatan)
    "\u064F"  # Damma
    "\u064C"  # Tanwin Damm (Dammatan)
    "\u0650"  # Kasra
    "\u064D"  # Tanwin Kasr (Kasratan)
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
    "]"
)
def remove_diacritics(text):
    """Return ``text`` with every match of ``arabic_diacritics`` deleted."""
    return arabic_diacritics.sub('', text)
def remove_punctuation(s):
    """Delete every punctuation character from ``s``.

    The set is ``string.punctuation`` plus the Arabic comma, semicolon and
    question mark and the two guillemets. Nothing is inserted in place of
    a removed character.
    """
    unwanted = string.punctuation + "،" + "؛" + "؟" + "«" + "»"
    # Mapping a code point to None makes str.translate drop that character.
    deletions = dict.fromkeys(map(ord, unwanted))
    return s.translate(deletions)
def remove_punctuation2(s):
    """Replace every punctuation character in ``s`` with a single space.

    Uses the same character set as ``remove_punctuation``
    (``string.punctuation`` plus Arabic comma, semicolon, question mark and
    the guillemets), but keeps the string length unchanged.
    """
    marks = string.punctuation + "،" + "؛" + "؟" + "«" + "»"
    blanks = {ord(mark): ' ' for mark in marks}
    return s.translate(blanks)
def html2text(text):
    """Parse ``text`` with BeautifulSoup's 'html.parser' and return ``get_text()``."""
    return BeautifulSoup(text, 'html.parser').get_text()
# A link starts at "scheme://" (the scheme stays optional, as it always was
# here, so a bare "://..." is still removed) or at a "www." prefix. It then
# runs to the next whitespace, but must not end on sentence punctuation or a
# closing bracket/quote, so that "(see http://a.com/b)." keeps its ")."
_URL_PATTERN = re.compile(
    r"(?:(?:https?|ftps?)?://|\bwww\.)"
    r"\S*"
    r"[^\s.,;:!?)\]}>\"'\u00BB\u060C\u061B\u061F]"
)


def remove_links(text):
    """Return ``text`` with URLs deleted.

    Unlike the previous character-whitelist pattern, the whole URL is
    consumed, including ``-``, ``#``, ``:``, ``~``, ``+``, ``@`` and a
    trailing ``/``, so no fragments such as ``-page`` are left behind.
    Scheme-less ``www.`` links are removed as well.

    Args:
        text: Input string, possibly containing links.

    Returns:
        The string with each link replaced by the empty string; surrounding
        whitespace and trailing punctuation are left untouched.
    """
    return _URL_PATTERN.sub('', text)
def remove_empty_lines(text):
    """Drop blank lines and strip trailing whitespace from the remaining ones.

    Lines are split on ``"\\n"``; a line that is empty after ``rstrip()`` is
    discarded, and the surviving right-stripped lines are re-joined with
    ``"\\n"``.
    """
    kept = []
    for raw in text.split("\n"):
        trimmed = raw.rstrip()
        if trimmed:
            kept.append(trimmed)
    return '\n'.join(kept)
def remove_repeating_char(text):
    """Shorten every run of one repeated character to exactly two copies.

    Runs of two are left as they are; runs of three or more are cut down to
    two. ``.`` does not match a newline, so consecutive newlines are not
    collapsed.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', text)
def keep_only_arabic(text):
    """Filter ``text`` down to the words accepted as Arabic, line by line.

    A whitespace-separated word is kept only when it is longer than one
    character, ``AlphabetDetector.is_arabic`` accepts it, and
    ``str.isalpha()`` is true for it. Kept words are re-joined with single
    spaces. Every input line yields one output line, which may be empty.
    """
    detector = AlphabetDetector()

    def is_wanted(token):
        # Checks run in this order so the detector is never called on
        # one-character tokens.
        return len(token) > 1 and detector.is_arabic(token) and token.isalpha()

    return '\n'.join(
        ' '.join(token for token in row.split() if is_wanted(token))
        for row in text.splitlines()
    )
def clean_doc(text):
    """Run the full cleaning pipeline on an HTML or plain-text document.

    Steps, in order: strip HTML markup, remove links, turn ``.`` into line
    breaks, remove diacritics, replace punctuation with spaces, keep only
    Arabic words, shorten repeated characters, and drop empty lines.

    Args:
        text: Raw document text, possibly containing HTML.

    Returns:
        The cleaned text, with lines separated by ``"\\n"``.
    """
    text = html2text(text)
    # Links have to be removed while "://", "." and "/" are still intact.
    # Previously "." and "/" were replaced first, so remove_links could
    # never match and Arabic words inside URLs leaked into the output.
    text = remove_links(text)
    # Each "." starts a new output line.
    text = text.replace('.', '\n')
    clean_text = remove_diacritics(text)
    # remove_punctuation2 already turns "\", "/", "-", "(" and ")" into
    # spaces, so they need no separate replace() calls.
    clean_text = remove_punctuation2(clean_text)
    clean_text = keep_only_arabic(clean_text)
    clean_text = remove_repeating_char(clean_text)
    clean_text = remove_empty_lines(clean_text)
    return clean_text