preprocessing_utils.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re

import numpy as np
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer


def preprocess_nlg_text(text, name, near, food, name_tok, near_tok, food_tok, word_based=False):
    # Strip line breaks and tabs, then delexicalise the slot values by
    # replacing them with their placeholder tokens.
    text = text.replace('\n', '').replace('\r', '').replace('\t', ' ')
    if name != '':
        text = text.replace(name, name_tok)
    if near != '':
        text = text.replace(near, near_tok)
    if food != '':
        text = text.replace(food, food_tok)
    if word_based:
        tokenizer = WordPunctTokenizer()
        tokens = tokenizer.tokenize(text)
    else:
        # Character-based mode: return the raw string so callers can iterate over characters.
        tokens = text
    return tokens
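

# The sketch below shows one way preprocess_nlg_text can be called; the sample
# sentence, slot values and *_TOK placeholder names are assumptions made for
# illustration, not values required by this module.
def _demo_preprocess_nlg_text():
    toks = preprocess_nlg_text('The Eagle serves French food near Cafe Rouge.',
                               name='The Eagle', near='Cafe Rouge', food='French',
                               name_tok='NAME_TOK', near_tok='NEAR_TOK',
                               food_tok='FOOD_TOK', word_based=True)
    print(toks)  # expected along the lines of ['NAME_TOK', 'serves', 'FOOD_TOK', 'food', 'near', 'NEAR_TOK', '.']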


def preprocess(tweet):
    tokenizer = TweetTokenizer()
    # Lowercase, drop newlines, and normalise URLs, user mentions and hashtags.
    tweet = tweet.lower()
    tweet = tweet.replace('\n', '')
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URLTOK', tweet)
    tweet = re.sub(r'@[^\s]+', 'USRTOK', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = tokenizer.tokenize(tweet)
    return list(map(lambda x: x.replace(' ', ''), tweet))
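

# Illustrative usage of preprocess(); the sample tweet below is an assumption
# and only demonstrates URL/user normalisation and hashtag stripping.
def _demo_preprocess():
    toks = preprocess('Check https://example.com via @someone #NLProc')
    print(toks)  # expected along the lines of ['check', 'URLTOK', 'via', 'USRTOK', 'nlproc']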


def preprocess_char_x_word(tweet):
    # Same normalisation as preprocess(); kept as a separate entry point for
    # the character-by-word pipeline.
    tokenizer = TweetTokenizer()
    # Lowercase, drop newlines, and normalise URLs, user mentions and hashtags.
    tweet = tweet.lower()
    tweet = tweet.replace('\n', '')
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URLTOK', tweet)
    tweet = re.sub(r'@[^\s]+', 'USRTOK', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet_tok = tokenizer.tokenize(tweet)
    return list(map(lambda x: x.replace(' ', ''), tweet_tok))


def convert2indices(data, alphabet, dummy_word_idx, unk_word_idx, max_sent_length=140, verbose=0):
    # Map each tokenised sentence to a fixed-length vector of vocabulary
    # indices, padding with dummy_word_idx and truncating to max_sent_length.
    data_idx = []
    max_len = 0
    unknown_words = 0
    known_words = 0
    for sentence in data:
        ex = np.ones(max_sent_length) * dummy_word_idx
        max_len = max(len(sentence), max_len)
        if len(sentence) > max_sent_length:
            sentence = sentence[:max_sent_length]
        for i, token in enumerate(sentence):
            idx = alphabet.get(token, unk_word_idx)
            ex[i] = idx
            if idx == unk_word_idx:
                unknown_words += 1
            else:
                known_words += 1
        data_idx.append(ex)
    data_idx = np.array(data_idx).astype('float32')
    if verbose == 1:
        print("Max length in this batch:", max_len)
        print("Number of unknown words:", unknown_words)
        print("Number of known words:", known_words)
    return data_idx
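

# Illustrative usage of convert2indices(); the toy alphabet and index choices
# below are assumptions, real callers would pass a vocabulary built elsewhere.
def _demo_convert2indices():
    alphabet = {'check': 3, 'URLTOK': 2, 'via': 4}
    batch = [['check', 'URLTOK', 'via', 'something']]
    idx = convert2indices(batch, alphabet, dummy_word_idx=0, unk_word_idx=1,
                          max_sent_length=8, verbose=1)
    print(idx)  # shape (1, 8): 'something' maps to unk_word_idx, the tail is padded with dummy_word_idx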


def hybrid_convert2indices(data, tokenized_data, alphabet, dummy_word_idx, unk_word_idx, max_sent_length=128, verbose=0):
    # Character-level variant: each character position of the (lowercased) raw
    # sentence is filled with the vocabulary index of the word token covering it.
    data_idx = []
    max_len = 0
    unknown_words = 0
    known_words = 0
    for sentence, sent_toks in zip(data, tokenized_data):
        sentence = sentence.lower()
        ex = np.ones(max_sent_length) * dummy_word_idx
        max_len = max(len(sent_toks), max_len)
        if len(sent_toks) > max_sent_length:
            sent_toks = sent_toks[:max_sent_length]
        sent_ptr = 0
        for token in sent_toks:
            vocab_idx = alphabet.get(token, (unk_word_idx, 1))[0]
            sidx = sentence.find(token, sent_ptr)
            if sidx == -1:
                # Token not found in the raw sentence (e.g. the tokenizer altered it); skip it.
                continue
            ex[sidx:sidx + len(token)] = vocab_idx
            # Advance the search pointer so repeated tokens map to later occurrences.
            sent_ptr = sidx + len(token)
            if vocab_idx == unk_word_idx:
                unknown_words += 1
            else:
                known_words += 1
        data_idx.append(ex)
    data_idx = np.array(data_idx).astype('float32')
    if verbose == 1:
        print("Max length in this batch:", max_len)
        print("Number of unknown words:", unknown_words)
        print("Number of known words:", known_words)
    return data_idx
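

# Illustrative usage of hybrid_convert2indices(); the raw sentence, token list
# and toy alphabet (whose values are (index, frequency) pairs, matching the
# .get(...) call above) are assumptions made for demonstration.
def _demo_hybrid_convert2indices():
    idx = hybrid_convert2indices(['hello world'], [['hello', 'world']],
                                 {'hello': (2, 7), 'world': (3, 4)},
                                 dummy_word_idx=0, unk_word_idx=1,
                                 max_sent_length=16, verbose=1)
    print(idx)  # one row of length 16 where each character span is filled with its token's index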