-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecognizer.py
138 lines (115 loc) · 4.77 KB
/
recognizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- coding: utf-8 -*-
from arktweet import CMUTweetTagger
import re
from tweet import Tweet
import nltk
from nltk.tree import Tree
from fuzzysearch import find_near_matches
# Split the tweet in tags using the lib TweetNLP
def identify_relevant_words(input_tweet):
    """Tokenize and POS-tag a tweet with the CMU ARK TweetNLP tagger.

    The tagger returns (text, type, confidence) triples; this function
    augments each triple with the token's start and end character index
    in ``input_tweet.text`` and stores the result in ``input_tweet.tags``.

    :param input_tweet: Tweet object whose ``text`` attribute is tagged.
    :return: the same Tweet object, with ``tags`` set to a list of
             (text, type, confidence, start, end) tuples; ``start``/``end``
             are -1 when the token could not be located in the text.
    """
    tags = CMUTweetTagger.runtagger_parse([input_tweet.text])
    last_start = 0
    final_tags = list()
    for tag in tags:
        for text, typ, confidence in tag:
            start = input_tweet.text.find(text, last_start)
            end = -1
            if start != -1:
                end = (start + len(text)) - 1
                # BUGFIX: only advance the search cursor on a hit. The
                # previous code set last_start = end unconditionally, so a
                # single miss left last_start at -1 and every later
                # find(text, -1) searched from the last character only,
                # making all remaining tokens report position -1.
                last_start = end
            final_tags.append((text, typ, confidence, start, end))
    input_tweet.tags = final_tags
    return input_tweet
# Normalize a single TweetNLP tag. Garbage tags (urls, emoticons, unidentified
# tokens) are flagged for removal instead of being cleaned up.
def pre_processing_tag(tag):
    """Return (cleaned_text, is_garbage, removed_length) for one tag.

    Hashtags ('#') and user mentions ('@') lose their leading symbol,
    underscores become spaces and camel-case is split into words.
    Urls ('U'), emoticons ('E') and garbage ('G') are marked for removal.
    Every other tag type passes through unchanged.
    """
    token_text, token_type = tag[0], tag[1]
    if token_type in ('#', '@'):
        readable = token_text[1:].replace("_", " ")
        #proc_tag = re.sub(r"(?<=\w)([A-Z])", r" \1", proc_tag)
        readable = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', readable)
        return readable, False, 0
    if token_type in ('U', 'E', 'G'):
        return None, True, len(token_text)
    return token_text, False, 0
def pre_processing_mention(mentions):
    """Normalize mention surface forms.

    Each input mention is a tuple whose first element is the raw surface
    text; underscores are turned into spaces and camel-case is split.
    Returns tuples of (normalized_text, raw_text, mention[1], mention[2]).
    """
    camel_case = re.compile(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))')
    normalized = []
    for mention in mentions:
        readable = camel_case.sub(r' \1', mention[0].replace("_", " "))
        normalized.append((readable, mention[0], mention[1], mention[2]))
    return normalized
# Preprocess a tweet by removing tokens considered as trash (urls, emoticons,
# unidentified tokens) while keeping tag offsets aligned with the new text.
def pre_processing_tweet(input_tweet):
    """Build input_tweet.procText from the kept (non-garbage) tags and
    store offset-corrected tags in input_tweet.procTags.

    NOTE(review): the source this was recovered from had its indentation
    flattened; the nesting of the per-iteration bookkeeping statements
    (previous_word_end update, strip, count) was reconstructed and should
    be confirmed against the original file.
    """
    input_tweet.procText = ''
    previous_word_end = -1    # end index (in the original text) of the last seen tag
    length_difference = 0     # cumulative offset shift from removed/expanded tokens
    count = 0
    proc_tags = list()
    length_tags = len(input_tweet.tags)
    proc_tag: str
    for tag in input_tweet.tags:
        # tag layout: (text, type, confidence, start, end) — see identify_relevant_words
        proc_tag, is_garbage, len_tag = pre_processing_tag(tag)
        # len_tag is nonzero only for garbage tokens; removing them shifts
        # every later offset to the left
        length_difference -= len_tag
        if is_garbage is False:
            if previous_word_end == -1:
                # first kept token: no separator needed
                input_tweet.procText += proc_tag
            else:
                # reproduce the original gap between this token and the previous one
                # NOTE(review): when a garbage token sat in that gap, this inserts
                # the garbage token's width as spaces — presumably later collapsed
                # (see the commented-out join below); verify against callers
                space_words = (int(tag[3]) - previous_word_end) - 1
                for i in range(space_words):
                    input_tweet.procText += " "
                input_tweet.procText += proc_tag
            if count < length_tags:
                temp_text, temp_type, temp_conf, temp_start, temp_end = list(tag)
                # shift this tag's offsets by everything removed/expanded before it
                temp_start += length_difference
                temp_end += length_difference
                # hashtags/usernames may grow when camel-case is split into words
                local_length = len(proc_tag) - len(temp_text)
                temp_end += local_length
                length_difference = length_difference + local_length
                proc_tags.append((temp_text, temp_type, temp_conf, temp_start, temp_end))
        previous_word_end = int(tag[4])
        input_tweet.procText = input_tweet.procText.strip()
        # inputTweet.procText = " ".join(inputTweet.procText.split())
        count += 1
    input_tweet.procTags = proc_tags
    return input_tweet
def get_mentions_index(input_tweet, mentions):
    """Locate each mention string inside input_tweet.procText.

    Searches left to right, resuming one character past the previous hit.
    Mentions that cannot be found are silently dropped. Returns a list of
    (mention, start_index, end_index) tuples, end exclusive.
    """
    located = []
    search_from = 0
    for surface in mentions:
        begin = input_tweet.procText.find(surface, search_from)
        if begin == -1:
            continue
        stop = begin + len(surface)
        located.append((surface, begin, stop))
        search_from = stop + 1
    return located
# User mentions (tags of type '@') are automatically treated as named-entity
# mentions, so they only need normalization before joining the mention list.
def identify_users_mention(input_tweet, mentions):
    """Return the given mentions plus the normalized text of every '@' tag,
    deduplicated (result order is unspecified)."""
    collected = set(mentions)
    for user_tag in (t for t in input_tweet.tags if t[1] == '@'):
        cleaned, _, _ = pre_processing_tag(user_tag)
        collected.add(cleaned)
    return list(collected)
def preprocessing_d2kb(tweet, mentions, verbose):
    """Run the full D2KB preprocessing pipeline on one tweet.

    Tags the tweet, cleans its text, normalizes the supplied mentions, and
    (when verbose == 'yes') prints each intermediate stage.
    """
    chatty = verbose == 'yes'
    if chatty:
        print('..:: Original tweet message ::..')
        print(tweet.text)
    tweet = identify_relevant_words(tweet)
    if chatty:
        print('\n..:: Pos-tagging tweet message ::..')
        print(tweet.tags)
    tweet = pre_processing_tweet(tweet)
    if chatty:
        print('\n..:: Preprocessed tweet message ::..')
        print(tweet.procText)
    tweet.mentions = pre_processing_mention(mentions)
    if chatty:
        print('\n..:: Mentions ::..')
        print(tweet.mentions)
    return tweet