-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
43 lines (33 loc) · 1.49 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import spacy
import re
# Load the spaCy 'de_core_news_sm' pipeline once, at import time;
# entity_recognizing() below runs every text through this shared object.
nlp = spacy.load('de_core_news_sm')
# Symbols that clean_text() isolates by surrounding them with spaces.
# A few symbols are listed more than once; that is harmless here because the
# translation table below is keyed by character, so each one is padded once.
# NOTE(review): the list also contains accented letters (e.g. 'Â', 'à', 'â',
# 'é', 'è', 'Ã', 'ï', 'Ø'), so words containing them are split apart --
# confirm that this is intended.
_PUNCTS = (
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#',
    '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â',
    '█', '½', 'à', '…',
    '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―',
    '¥', '▓', '—', '‹', '─',
    '▒', ':', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸',
    '¾', 'Ã', '⋅', '‘', '∞',
    '∙', ')', '↓', '、', '│', '(', '»', ',', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√',
)

# Built once at import: maps every listed character to " <char> ".
_PUNCT_PADDING = str.maketrans({punct: f' {punct} ' for punct in _PUNCTS})


def clean_text(x):
    """Return ``str(x)`` with every listed symbol padded by one space per side.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. All symbols are handled in a single ``str.translate`` pass, so
    each occurrence is padded exactly once; the earlier loop of sequential
    ``replace`` calls padded a symbol twice when it appeared twice in the
    list. Existing whitespace is left as is, so adjacent symbols produce
    runs of spaces.

    Args:
        x: Text to process (any object; converted with ``str``).

    Returns:
        The padded string.
    """
    return str(x).translate(_PUNCT_PADDING)
def remove_names(x):
    """Remove every whitespace-delimited token that starts with ``@``.

    Only the token itself is deleted; the whitespace around it is kept, so
    ``"hi @bob there"`` becomes ``"hi  there"``. An ``@`` in the middle of a
    token (for example in an e-mail address) is not treated as a mention.

    The earlier implementation called ``x.replace(word, "")`` for each
    mention, which also deleted that text inside unrelated tokens (the
    mention ``@al`` turned ``@alice`` into ``ice``).

    Args:
        x: Input text (``str``).

    Returns:
        The text with the mention tokens removed.
    """
    # (?<!\S) requires the "@" to sit at the start of the string or right
    # after whitespace; \S* then consumes the rest of that token only.
    return re.sub(r"(?<!\S)@\S*", "", x)
# Matches "http" followed by at least one non-whitespace character.
_URL_PATTERN = re.compile(r"http\S+")


def remove_url(x):
    """Delete every run of text that starts with ``http`` from *x*.

    A match extends from ``http`` to the next whitespace character (or the
    end of the string); surrounding whitespace is left in place.

    Args:
        x: Input text (``str``).

    Returns:
        The text with all such matches removed.
    """
    return _URL_PATTERN.sub("", x)
def entity_recognizing(x):
    """Replace person names detected by the module-level ``nlp`` pipeline.

    The text is run through ``nlp``; the text of every entity labelled
    ``'PER'`` is then replaced by the literal placeholder ``"name"``
    wherever it occurs in the string.

    Names are replaced longest-first. Replacing in entity order could handle
    a shorter name ("Anna") before a longer one containing it
    ("Anna Schmidt"), leaving "name Schmidt" behind. Detected names are no
    longer printed to stdout, since this function exists to hide them.

    Args:
        x: Input text (``str``).

    Returns:
        The text with detected person names replaced by ``"name"``.
    """
    doc = nlp(x)
    person_names = {ent.text for ent in doc.ents if ent.label_ == 'PER'}
    # Sort by descending length (ties broken alphabetically for determinism)
    # so a name is never rewritten before a longer name that contains it.
    for person in sorted(person_names, key=lambda name: (-len(name), name)):
        x = x.replace(person, "name")
    return x