text_processor.py
from __future__ import unicode_literals
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from utils import save_model, load_tokens
from hazm import Normalizer
import re


class TextProcessor:
    """
    Provide methods for pre-processing and converting text.

    Attributes:
        tokens (Tokenizer): the Keras tokenizer fitted on the training text (also saved to disk).
        sequences (numpy.ndarray): padded integer sequences produced by text_encoding.
    """

    def __init__(self):
        self.tokens = []
        self.sequences = []

    def text_encoding(self, text, max_words, max_len, is_test=False):
        """
        Convert text documents to integer sequences, padded to a maximum length of max_len words.

        :param str text: Input documents
        :param int max_words: The maximum number of words to keep. Only the most common max_words - 1 words are kept
        :param int max_len: Length (in words) to which each document is padded
        :param bool is_test: If True, load the tokenizer extracted during the training phase. Otherwise, fit a new tokenizer on the input text
        :return: None. The padded sequences are stored in self.sequences
        """
        if is_test:
            # Reuse the tokenizer fitted during training so the word index stays consistent
            tokenizer = load_tokens("tokens.pickle")
        else:
            tokenizer = Tokenizer(num_words=max_words)
            tokenizer.fit_on_texts(text)
            self.tokens = tokenizer
            save_model("tokens.pickle", self.tokens)
        sequences = tokenizer.texts_to_sequences(text)
        sequences = pad_sequences(sequences, maxlen=max_len)
        self.sequences = sequences

    def preprocess_text(self, text):
        """
        Pre-process text by normalizing it and removing or replacing some tokens.

        :param text: Input data text (a pandas Series or numpy array of strings)
        :return: Normalized and cleaned input data as a list of strings
        """
        text = text.tolist()
        normalizer = Normalizer()
        normalized_data = []
        # punctuation = r"""!"#$%&'()*+,،-./:;<=>?@[\]^_`{|}~"""  # Alternative method to deal with punctuation
        for item in text:
            if re.search(r'\d', item):
                # Replace runs of digits with special tokens, longest runs first
                # (e.g. "123" -> "###", "12345" or longer -> "#####")
                item = re.sub('[0-9]{5,}', '#####', item)
                item = re.sub('[0-9]{4}', '####', item)
                item = re.sub('[0-9]{3}', '###', item)
                item = re.sub('[0-9]{2}', '##', item)
                item = re.sub('[0-9]', '#', item)
            item = re.sub('[A-Za-z]', '#!', item)  # Replace non-Persian (Latin) characters with a token
            item = re.sub(r'[^\w\s]', '', item)  # Remove punctuation
            # item = item.translate(str.maketrans('', '', punctuation))  # Alternative method to deal with punctuation
            normalized_item = normalizer.normalize(item)
            normalized_data.append(normalized_item)
        return normalized_data
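

# A minimal usage sketch, not part of the original module: it assumes the input
# arrives as a pandas Series of Persian strings and that utils.save_model /
# utils.load_tokens pickle and unpickle the tokenizer under the given filename.
if __name__ == "__main__":
    import pandas as pd

    docs = pd.Series(["نمونه متن 1234 با text انگلیسی!", "سند دوم بدون رقم."])

    processor = TextProcessor()
    cleaned = processor.preprocess_text(docs)  # list of normalized, cleaned strings
    processor.text_encoding(cleaned, max_words=10000, max_len=50)
    print(processor.sequences.shape)  # (2, 50): one padded integer row per document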