-
Notifications
You must be signed in to change notification settings - Fork 2
/
NLTK_helper.py
79 lines (66 loc) · 2.56 KB
/
NLTK_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import nltk
class NLTK_Helper:
    """Tag person and place names in plain text with TEI-style XML elements.

    The text is tokenized, POS-tagged and chunked with NLTK's named-entity
    chunker.  Chunks labelled ``PERSON`` are wrapped in ``<persName>`` and
    chunks labelled ``GPE`` in ``<placeName>``; every other token is passed
    through unchanged.  The tokens are then glued back into one string.
    """

    # Named-entity chunk label -> XML element name used to wrap the entity.
    _ENTITY_ELEMENTS = {
        "PERSON": "persName",
        "GPE": "placeName",
    }

    # Ordered (tokenizer artefact, replacement) pairs applied after the tokens
    # have been joined with single spaces.  The order is significant because
    # later rules operate on the output of earlier ones.
    _DETOKENIZE_RULES = (
        (" .", "."),
        (" !", "!"),
        (" ?", "?"),
        (" ,", ","),
        (" \"", "\""),
        (" ;", ";"),
        ("`` ", '"'),
        ("'' ", '"'),
        (" ''", '"'),
        (" 'll", "'ll"),
        (" 're", "'re"),
        (" n't", "n't"),
        (" 's", "'s"),
    )

    def __init__(self):
        pass

    def process(self, p):
        """Return *p* with recognised person and place names wrapped in tags.

        The input and the intermediate results are kept on the instance as
        ``p``, ``dic_of_token`` and ``string_tagged``.
        """
        self.p = p
        tree_with_entities = self.found_entities(self.p)
        self.dic_of_token = self.make_paragraph_tagged(tree_with_entities)
        self.string_tagged = self.make_string_tagged(self.dic_of_token)
        return self.string_tagged

    def found_entities(self, paragraph=""):
        """Tokenize, POS-tag and named-entity-chunk *paragraph*.

        When *paragraph* is empty the text last given to ``process`` (stored
        in ``self.p``) is used instead, so argument-less calls keep working.

        Returns the chunk tree produced by ``nltk.ne_chunk``.
        """
        if not paragraph:
            paragraph = getattr(self, "p", "")
        # Detach full stops so each one becomes a token of its own.
        paragraph = paragraph.replace(".", " .")
        # Line breaks become spaces (not nothing): otherwise the last word of
        # one line would be fused with the first word of the next.
        paragraph = paragraph.replace("\n", " ")
        tokens = nltk.word_tokenize(paragraph)
        tokens_with_pos = nltk.pos_tag(tokens)
        return nltk.ne_chunk(tokens_with_pos)

    def make_paragraph_tagged(self, tree):
        """Map each top-level position of *tree* to its output string.

        Plain ``(word, tag)`` tuples contribute the word itself.  Entity
        subtrees contribute their literal text, wrapped in the matching XML
        element when the chunk label is ``PERSON`` or ``GPE``.

        The tree is also stored on the instance as ``self.tree``.

        Returns a dict keyed by the position of the node inside *tree*.
        """
        # NOTE: only the top level is walked; this could be made recursive.
        self.tree = tree
        tagged_tokens = {}
        for index, node in enumerate(tree):
            if isinstance(node, tuple):
                tagged_tokens[index] = node[0]
            elif isinstance(node, nltk.tree.Tree):
                literal = self.make_literal(node)
                # Compare the chunk label itself instead of searching the
                # printed subtree, so an entity whose text merely contains
                # "PERSON" or "GPE" is not tagged by mistake.
                element = self._ENTITY_ELEMENTS.get(node.label())
                if element is None:
                    tagged_tokens[index] = literal
                else:
                    tagged_tokens[index] = (
                        "<" + element + ">" + literal + "</" + element + ">"
                    )
        return tagged_tokens

    def make_literal(self, node):
        """Return the words of an entity *node* joined by single spaces.

        *node* is a sequence of ``(word, tag)`` pairs.  Entities of any
        length are supported; when *node* offers ``leaves()`` (as NLTK trees
        do) it is used so that nested subtrees are flattened as well.
        """
        leaves = node.leaves() if hasattr(node, "leaves") else node
        return " ".join(str(leaf[0]) for leaf in leaves)

    def make_string_tagged(self, tokens_as_dic):
        """Join the tokens of *tokens_as_dic* into one readable string.

        Tokens are emitted in ascending key order, separated by spaces, and
        the spacing artefacts introduced by tokenization (blanks before
        punctuation, split contractions, `` and '' quote tokens) are undone.
        """
        words = [tokens_as_dic[key] for key in sorted(tokens_as_dic)]
        text = " ".join(words)
        for artefact, replacement in self._DETOKENIZE_RULES:
            text = text.replace(artefact, replacement)
        return text
###########################################
# To test the NLTK_Helper while developing
# without having to deal with XML files
# use the code below:
#test = NLTK_Helper()
#raw_text = "Mike likes pizza from United States ! And Lars (on the other hand)... doesn't like it!"
#processed_text = test.process(raw_text)
#print(processed_text)
##########################################