-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecognizer.py
138 lines (115 loc) · 4.77 KB
/
recognizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- coding: utf-8 -*-
from arktweet import CMUTweetTagger
import re
from tweet import Tweet
import nltk
from nltk.tree import Tree
from fuzzysearch import find_near_matches
# Split the tweet in tags using the lib TweetNLP
def identify_relevant_words(input_tweet):
    """Tokenize and POS-tag a tweet with the CMU ARK TweetNLP tagger.

    The tagger returns (text, type, confidence) triples; this function
    augments each triple with the token's start and end character index
    in ``input_tweet.text`` and stores the result in ``input_tweet.tags``.

    :param input_tweet: Tweet object whose ``text`` attribute is tagged.
    :return: the same Tweet object, with ``tags`` set to a list of
             (text, type, confidence, start, end) tuples; ``start``/``end``
             are -1 when the token could not be located in the text.
    """
    tags = CMUTweetTagger.runtagger_parse([input_tweet.text])
    last_start = 0
    final_tags = list()
    for tag in tags:
        for text, typ, confidence in tag:
            start = input_tweet.text.find(text, last_start)
            end = -1
            if start != -1:
                end = (start + len(text)) - 1
                # BUGFIX: only advance the search cursor on a hit. The
                # previous code set last_start = end unconditionally, so a
                # single miss left last_start at -1 and every later
                # find(text, -1) searched from the last character only,
                # making all remaining tokens report position -1.
                last_start = end
            final_tags.append((text, typ, confidence, start, end))
    input_tweet.tags = final_tags
    return input_tweet
# Normalize a single TweetNLP tag. Garbage tags (urls, emoticons, unidentified
# tokens) are flagged for removal instead of being cleaned up.
def pre_processing_tag(tag):
    """Return (cleaned_text, is_garbage, removed_length) for one tag.

    Hashtags ('#') and user mentions ('@') lose their leading symbol,
    underscores become spaces and camel-case is split into words.
    Urls ('U'), emoticons ('E') and garbage ('G') are marked for removal.
    Every other tag type passes through unchanged.
    """
    token_text, token_type = tag[0], tag[1]
    if token_type in ('#', '@'):
        readable = token_text[1:].replace("_", " ")
        #proc_tag = re.sub(r"(?<=\w)([A-Z])", r" \1", proc_tag)
        readable = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', readable)
        return readable, False, 0
    if token_type in ('U', 'E', 'G'):
        return None, True, len(token_text)
    return token_text, False, 0
def pre_processing_mention(mentions):
    """Normalize mention surface forms.

    Each input mention is a tuple whose first element is the raw surface
    text; underscores are turned into spaces and camel-case is split.
    Returns tuples of (normalized_text, raw_text, mention[1], mention[2]).
    """
    camel_case = re.compile(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))')
    normalized = []
    for mention in mentions:
        readable = camel_case.sub(r' \1', mention[0].replace("_", " "))
        normalized.append((readable, mention[0], mention[1], mention[2]))
    return normalized
# Preprocess a tweet by removing tokens considered as trash (urls, emoticons,
# unidentified tokens) while keeping tag offsets aligned with the new text.
def pre_processing_tweet(input_tweet):
    """Build input_tweet.procText from the kept (non-garbage) tags and
    store offset-corrected tags in input_tweet.procTags.

    NOTE(review): the source this was recovered from had its indentation
    flattened; the nesting of the per-iteration bookkeeping statements
    (previous_word_end update, strip, count) was reconstructed and should
    be confirmed against the original file.
    """
    input_tweet.procText = ''
    previous_word_end = -1    # end index (in the original text) of the last seen tag
    length_difference = 0     # cumulative offset shift from removed/expanded tokens
    count = 0
    proc_tags = list()
    length_tags = len(input_tweet.tags)
    proc_tag: str
    for tag in input_tweet.tags:
        # tag layout: (text, type, confidence, start, end) — see identify_relevant_words
        proc_tag, is_garbage, len_tag = pre_processing_tag(tag)
        # len_tag is nonzero only for garbage tokens; removing them shifts
        # every later offset to the left
        length_difference -= len_tag
        if is_garbage is False:
            if previous_word_end == -1:
                # first kept token: no separator needed
                input_tweet.procText += proc_tag
            else:
                # reproduce the original gap between this token and the previous one
                # NOTE(review): when a garbage token sat in that gap, this inserts
                # the garbage token's width as spaces — presumably later collapsed
                # (see the commented-out join below); verify against callers
                space_words = (int(tag[3]) - previous_word_end) - 1
                for i in range(space_words):
                    input_tweet.procText += " "
                input_tweet.procText += proc_tag
            if count < length_tags:
                temp_text, temp_type, temp_conf, temp_start, temp_end = list(tag)
                # shift this tag's offsets by everything removed/expanded before it
                temp_start += length_difference
                temp_end += length_difference
                # hashtags/usernames may grow when camel-case is split into words
                local_length = len(proc_tag) - len(temp_text)
                temp_end += local_length
                length_difference = length_difference + local_length
                proc_tags.append((temp_text, temp_type, temp_conf, temp_start, temp_end))
        previous_word_end = int(tag[4])
        input_tweet.procText = input_tweet.procText.strip()
        # inputTweet.procText = " ".join(inputTweet.procText.split())
        count += 1
    input_tweet.procTags = proc_tags
    return input_tweet
def get_mentions_index(input_tweet, mentions):
    """Locate each mention string inside input_tweet.procText.

    Searches left to right, resuming one character past the previous hit.
    Mentions that cannot be found are silently dropped. Returns a list of
    (mention, start_index, end_index) tuples, end exclusive.
    """
    located = []
    search_from = 0
    for surface in mentions:
        begin = input_tweet.procText.find(surface, search_from)
        if begin == -1:
            continue
        stop = begin + len(surface)
        located.append((surface, begin, stop))
        search_from = stop + 1
    return located
# User mentions (tags of type '@') are automatically treated as named-entity
# mentions, so they only need normalization before joining the mention list.
def identify_users_mention(input_tweet, mentions):
    """Return the given mentions plus the normalized text of every '@' tag,
    deduplicated (result order is unspecified)."""
    collected = set(mentions)
    for user_tag in (t for t in input_tweet.tags if t[1] == '@'):
        cleaned, _, _ = pre_processing_tag(user_tag)
        collected.add(cleaned)
    return list(collected)
def preprocessing_d2kb(tweet, mentions, verbose):
    """Run the full D2KB preprocessing pipeline on one tweet.

    Tags the tweet, cleans its text, normalizes the supplied mentions, and
    (when verbose == 'yes') prints each intermediate stage.
    """
    chatty = verbose == 'yes'
    if chatty:
        print('..:: Original tweet message ::..')
        print(tweet.text)
    tweet = identify_relevant_words(tweet)
    if chatty:
        print('\n..:: Pos-tagging tweet message ::..')
        print(tweet.tags)
    tweet = pre_processing_tweet(tweet)
    if chatty:
        print('\n..:: Preprocessed tweet message ::..')
        print(tweet.procText)
    tweet.mentions = pre_processing_mention(mentions)
    if chatty:
        print('\n..:: Mentions ::..')
        print(tweet.mentions)
    return tweet