#! /usr/bin/env python
import csv
import itertools
import numpy as np
import nltk
import time
import sys
import operator
import io
import array
from datetime import datetime
from gru_theano import GRUTheano
SENTENCE_START_TOKEN = "SENTENCE_START"
SENTENCE_END_TOKEN = "SENTENCE_END"
UNKNOWN_TOKEN = "UNKNOWN_TOKEN"

def load_data(filename="data/reddit-comments-2015-08.csv", vocabulary_size=2000, min_sent_characters=0):

    word_to_index = []
    index_to_word = []

    # Read the data and append SENTENCE_START and SENTENCE_END tokens
    print("Reading CSV file...")
    with open(filename, 'rt') as f:
        reader = csv.reader(f, skipinitialspace=True)
        reader.next()
        # Split full comments into sentences
        sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode("utf-8").lower()) for x in reader])
        # Filter sentences
        sentences = [s for s in sentences if len(s) >= min_sent_characters]
        sentences = [s for s in sentences if "http" not in s]
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (SENTENCE_START_TOKEN, x, SENTENCE_END_TOKEN) for x in sentences]
    print("Parsed %d sentences." % (len(sentences)))

    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print("Found %d unique word tokens." % len(word_freq.items()))

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = sorted(word_freq.items(), key=lambda x: (x[1], x[0]), reverse=True)[:vocabulary_size-2]
    print("Using vocabulary size %d." % vocabulary_size)
    print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

    sorted_vocab = sorted(vocab, key=operator.itemgetter(1))
    index_to_word = ["<MASK/>", UNKNOWN_TOKEN] + [x[0] for x in sorted_vocab]
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

    # Replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [w if w in word_to_index else UNKNOWN_TOKEN for w in sent]

    # Create the training data
    X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
    y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

    return X_train, y_train, word_to_index, index_to_word
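
# Example usage (a minimal sketch; assumes the Reddit-comments CSV shipped with the
# original tutorial is present under data/ and that nltk's tokenizer models have
# been downloaded):
#
#   X_train, y_train, word_to_index, index_to_word = load_data(
#       "data/reddit-comments-2015-08.csv", vocabulary_size=2000)
#   print("Number of training sentences: %d" % len(X_train))
#   print("First training example: %s" % X_train[0])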

def train_with_sgd(model, X_train, y_train, learning_rate=0.001, nepoch=20, decay=0.9,
                   callback_every=10000, callback=None):
    num_examples_seen = 0
    for epoch in range(nepoch):
        # For each training example...
        for i in np.random.permutation(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate, decay)
            num_examples_seen += 1
            # Optionally do callback
            if (callback and callback_every and num_examples_seen % callback_every == 0):
                callback(model, num_examples_seen)
    return model
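
# Example usage (a sketch; the GRUTheano constructor call mirrors the one in
# load_model_parameters_theano below, and report_progress is a hypothetical
# callback that only prints how far training has progressed):
#
#   model = GRUTheano(2000, hidden_dim=128)
#
#   def report_progress(model, num_examples_seen):
#       print("Seen %d training examples" % num_examples_seen)
#
#   train_with_sgd(model, X_train, y_train, learning_rate=0.001, nepoch=20,
#                  decay=0.9, callback_every=25000, callback=report_progress)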

def save_model_parameters_theano(model, outfile):
    np.savez(outfile,
        E=model.E.get_value(),
        U=model.U.get_value(),
        W=model.W.get_value(),
        V=model.V.get_value(),
        b=model.b.get_value(),
        c=model.c.get_value())
    print "Saved model parameters to %s." % outfile

def load_model_parameters_theano(path, modelClass=GRUTheano):
    npzfile = np.load(path)
    E, U, W, V, b, c = npzfile["E"], npzfile["U"], npzfile["W"], npzfile["V"], npzfile["b"], npzfile["c"]
    hidden_dim, word_dim = E.shape[0], E.shape[1]
    print "Building model from %s with hidden_dim=%d word_dim=%d" % (path, hidden_dim, word_dim)
    sys.stdout.flush()
    model = modelClass(word_dim, hidden_dim=hidden_dim)
    model.E.set_value(E)
    model.U.set_value(U)
    model.W.set_value(W)
    model.V.set_value(V)
    model.b.set_value(b)
    model.c.set_value(c)
    return model
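
# Example round trip (a sketch; the file name is arbitrary, and np.savez appends
# ".npz" to names that do not already end with it):
#
#   save_model_parameters_theano(model, "GRU-2000-128.dat")
#   model = load_model_parameters_theano("GRU-2000-128.dat.npz")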

def gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01):
    # Overwrite the bptt attribute. We need to backpropagate all the way to get the correct gradient
    model.bptt_truncate = 1000
    # Calculate the gradients using backprop
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['E', 'U', 'W', 'b', 'V', 'c']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the model, e.g. model.W
        parameter_T = operator.attrgetter(pname)(model)
        parameter = parameter_T.get_value()
        print "Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            parameter_T.set_value(parameter)
            gradplus = model.calculate_total_loss([x],[y])
            parameter[ix] = original_value - h
            parameter_T.set_value(parameter)
            gradminus = model.calculate_total_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            parameter[ix] = original_value
            parameter_T.set_value(parameter)
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # Calculate the relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is too large, fail the gradient check
            if relative_error > error_threshold:
                print "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix)
                print "+h Loss: %f" % gradplus
                print "-h Loss: %f" % gradminus
                print "Estimated_gradient: %f" % estimated_gradient
                print "Backpropagation gradient: %f" % backprop_gradient
                print "Relative Error: %f" % relative_error
                return
            it.iternext()
        print "Gradient check for parameter %s passed." % (pname)

def print_sentence(s, index_to_word):
    sentence_str = [index_to_word[x] for x in s[1:-1]]
    print(" ".join(sentence_str))
    sys.stdout.flush()

def generate_sentence(model, index_to_word, word_to_index, min_length=5):
    # We start the sentence with the start token
    new_sentence = [word_to_index[SENTENCE_START_TOKEN]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[SENTENCE_END_TOKEN]:
        next_word_probs = model.predict(new_sentence)[-1]
        samples = np.random.multinomial(1, next_word_probs)
        sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
        # Sometimes we get stuck if the sentence becomes too long, e.g. "........" :(
        # We also don't want sentences containing UNKNOWN_TOKEN
        if len(new_sentence) > 100 or sampled_word == word_to_index[UNKNOWN_TOKEN]:
            return None
    if len(new_sentence) < min_length:
        return None
    return new_sentence

def generate_sentences(model, n, index_to_word, word_to_index):
    for i in range(n):
        sent = None
        while not sent:
            sent = generate_sentence(model, index_to_word, word_to_index)
        print_sentence(sent, index_to_word)
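
# Example usage (a sketch; assumes a model trained with train_with_sgd above and the
# word_to_index/index_to_word mappings returned by load_data):
#
#   generate_sentences(model, 10, index_to_word, word_to_index)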