Substantial code clean up.
julianser committed Jun 8, 2016
1 parent 9e2ff35 commit db5b9fa
Showing 13 changed files with 753 additions and 780 deletions.
12 changes: 6 additions & 6 deletions Evaluation/embedding_metrics.py
@@ -38,7 +38,7 @@ def greedy_match(fileone, filetwo, w2v):
res2 = greedy_score(filetwo, fileone, w2v)
res_sum = (res1 + res2)/2.0

return np.mean(res_sum), np.std(res_sum)
return np.mean(res_sum), 1.96*np.std(res_sum)/float(len(res_sum)), np.std(res_sum)


def greedy_score(fileone, filetwo, w2v):
@@ -135,7 +135,7 @@ def extrema_score(fileone, filetwo, w2v):
scores.append(o)

scores = np.asarray(scores)
return np.mean(scores), np.std(scores)
return np.mean(scores), 1.96*np.std(scores)/float(len(scores)), np.std(scores)


def average(fileone, filetwo, w2v):
@@ -175,7 +175,7 @@ def average(fileone, filetwo, w2v):
scores.append(o)

scores = np.asarray(scores)
return np.mean(scores), np.std(scores)
return np.mean(scores), 1.96*np.std(scores)/float(len(scores)), np.std(scores)


if __name__ == "__main__":
@@ -189,11 +189,11 @@ def average(fileone, filetwo, w2v):
w2v = Word2Vec.load_word2vec_format(args.embeddings, binary=True)

r = average(args.ground_truth, args.predicted, w2v)
print("Embedding Average Score: %f +/- %f " %(r[0], r[1]))
print("Embedding Average Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))

r = greedy_match(args.ground_truth, args.predicted, w2v)
print("Greedy Matching Score: %f +/- %f " %(r[0], r[1]))
print("Greedy Matching Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))

r = extrema_score(args.ground_truth, args.predicted, w2v)
print("Extrema Score: %f +/- %f " %(r[0], r[1]))
print("Extrema Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))

82 changes: 67 additions & 15 deletions README.md
@@ -1,13 +1,20 @@
# hed-dlg-truncated
Hierarchical Encoder Decoder RNN (HRED) with Truncated Backpropagation Through Time (Truncated BPTT) for Dialog Modeling.
### Description
This repository hosts the Hierarchical Encoder Decoder RNN model (HRED) and
the Latent Variable Hierarchical Recurrent Encoder-Decoder RNN model (VHRED) for generative dialog modeling as described by Serban et al. (2016a) and Serban et al. (2016c).

The truncated computation is based on the trick of splitting each document into shorter sequences (e.g. 80 tokens) and then computing gradients for each sequence separately, but where the hidden state of the RNNs have been initialized from the preceding sequences (i.e. the hidden states have been forward propagated through the previous states).

# Creating A Dataset

The script convert-text2dict.py can be used to generate model datasets based on text files with dialogues. It only requires that the document contains end-of-utterance tokens </s> which are used to construct the model graph, since the utterance encoder is only connected to the dialogue encoder at the end of each utterance.
### Truncated BPTT
Both models are implemented using Truncated Backpropagation Through Time (Truncated BPTT).
The truncated computation is carried out by splitting each document (dialogue) into shorter sequences (e.g. 80 tokens) and computing gradients for each sequence separately, such that the hidden states of the RNNs on each subsequence are initialized from the preceding sequences (i.e. the hidden states are forward-propagated through the previous subsequences).
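
A schematic illustration of this scheme in plain Python (this is only a sketch; the actual training loop is implemented in Theano in train.py, and train_batch_fn and its signature are assumptions made for the illustration):

```python
def train_on_document(document_tokens, train_batch_fn, initial_hidden, max_len=80):
    """Schematic truncated BPTT over one document.

    train_batch_fn(chunk, hidden) is assumed to be a compiled training step that
    updates the parameters using this chunk only and returns the final hidden
    state as a plain numeric array, so no gradient flows into earlier chunks.
    """
    hidden = initial_hidden
    for start in range(0, len(document_tokens), max_len):
        chunk = document_tokens[start:start + max_len]
        # The hidden state is forward-propagated from the preceding chunks,
        # but backpropagation stops at the chunk boundary.
        cost, hidden = train_batch_fn(chunk, hidden)
    return hidden
```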

Prepare your dataset as a text file for with one document (e.g. movie script or subtitle) per line. The dialogues are assumed to be tokenized. If you have validation and test sets, they must satisfy the same requirements.


### Creating Datasets
The script convert-text2dict.py can be used to generate model datasets based on text files with dialogues.
It only requires that the document contains end-of-utterance tokens </s> which are used to construct the model graph, since the utterance encoder is only connected to the dialogue encoder at the end of each utterance.

Prepare your dataset as a text file with one document per line (e.g. one dialogue per line). The documents are assumed to be tokenized. If you have validation and test sets, they must satisfy the same requirements.

Once you're ready, you can create the model dataset files by running:

@@ -28,25 +35,70 @@ NOTE: The script automatically adds the following special tokens specific to mov
- off screen: <off_screen>
- pause: <pause>

If these do not exist in your dataset, you can safely ignore these, but remember that your vocabulary will still contain these.
If these do not exist in your dataset, you can safely ignore these. The model will learn to assign approximately zero probability mass to them.
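
As a concrete example (with hypothetical file names and cutoff value), the conversion could be run as follows, using the arguments defined in convert-text2dict.py; the validation set reuses the training dictionary so that both share the same vocabulary:

    python convert-text2dict.py Data/Train.txt Data/Train --cutoff 20000
    python convert-text2dict.py Data/Valid.txt Data/Valid --dict Data/Train.dict.pkl

This produces Data/Train.dialogues.pkl and Data/Train.dict.pkl (and the corresponding validation files), following the output suffixes used by convert-text2dict.py.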


# Training The Model

### Model Training
If you have Theano with GPU installed (bleeding edge version), you can train the model as follows:
1) Clone the GitHub repository
2) Create new "Output" and "Data" directories inside it.
3) Unpack your dataset files into the "Data" directory.
4) Create a new prototype inside state.py (look at prototype_movies or prototype_test as examples)
4) Create a new prototype inside state.py (look at prototype_ubuntu_HRED for an example)
5) From the terminal, cd into the code directory and run:

THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python train.py --prototype <prototype_name> &> Model_Output.txt
THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python train.py --prototype <prototype_name> &> Model_Output.txt

where <prototype_name> is a state (model architecture) defined inside state.py.
Training a model to convergence on a modern GPU on the Ubuntu Dialogue Corpus with 46 million tokens takes about 1-2 weeks. If your GPU runs out of memory, you can adjust the bs (batch size) parameter in the model state, but training will be slower. You can also play around with the other parameters inside state.py.
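
For step 4 above, a new prototype is just a function in state.py returning a state dictionary. The sketch below is hypothetical: apart from bs, the key names and paths are assumptions and should be checked against prototype_ubuntu_HRED in state.py:

```python
def prototype_my_corpus():
    # Hypothetical example: start from an existing prototype and override fields.
    state = prototype_ubuntu_HRED()
    state['train_dialogues'] = "Data/Train.dialogues.pkl"  # assumed key and path
    state['valid_dialogues'] = "Data/Valid.dialogues.pkl"  # assumed key and path
    state['dictionary'] = "Data/Train.dict.pkl"            # assumed key and path
    state['bs'] = 40  # reduce the batch size if the GPU runs out of memory
    return state
```

The model would then be trained with --prototype prototype_my_corpus.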

To test a model w.r.t. word perplexity run:

THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python evaluate.py --exclude-sos --plot-graphs Output/<model_name> Model_Evaluation.txt

where <model_name> is the name automatically generated during training.



### Model Sampling & Test

To generate model responses using beam search run:

THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=gpu python sample.py <model_name> <contexts> <model_outputs> --beam_search --n-samples=<beams> --ignore-unk --verbose

where <model_name> is the name automatically generated during training, <contexts> is a file containing the dialogue contexts with one dialogue per line, and <beams> is the size of the beam search. The results are saved in the file <model_outputs>.
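
For instance (hypothetical file and model names), each line of <contexts> is a dialogue context whose utterances end with the </s> token:

    how do I install java on ubuntu ? </s> which version do you want ? </s>

and beam search with 5 beams could then be run as:

    THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=gpu python sample.py Output/MyModel test_contexts.txt test_outputs.txt --beam_search --n-samples=5 --ignore-unk --verbose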

To compute the embedding-based metrics on the generated responses run:

python Evaluation/embedding_metrics.py <ground_truth_responses> <model_outputs> <word_emb>

where <ground_truth_responses> is a file containing the ground truth responses, and <word_emb> is the path to the binarized word embeddings. For the word embeddings, we recommend using Word2Vec trained on the GoogleNews Corpus: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM.
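
A hypothetical invocation, assuming the GoogleNews vectors have been downloaded to GoogleNews-vectors-negative300.bin and reusing the file names from the sampling example above:

    python Evaluation/embedding_metrics.py test_responses.txt test_outputs.txt GoogleNews-vectors-negative300.bin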



### Citation

If you build on this work, we'd really appreciate it if you could cite our papers:

A Hierarchical Latent Variable Encoder-Decoder Model for Generating Dialogues. Iulian Vlad Serban, Alessandro Sordoni, Ryan Lowe, Laurent Charlin, Joelle Pineau, Aaron Courville, Yoshua Bengio. 2016. http://arxiv.org/abs/1605.06069

Building End-To-End Dialogue Systems Using Generative Hierarchical Neural Network Models. Iulian V. Serban, Alessandro Sordoni, Yoshua Bengio, Aaron Courville, Joelle Pineau. 2016. AAAI. http://arxiv.org/abs/1507.04808.


### Datasets

The pre-processed Ubuntu Dialogue Corpus and model responses used by Serban et al. (2016a) are available at: http://www.iulianserban.com/Files/UbuntuDialogueCorpus.zip. These can be used with the model states "prototype_ubuntu_LSTM", "prototype_ubuntu_HRED", and "prototype_ubuntu_VHRED" (see state.py) to reproduce the results of Serban et al. (2016a) on the Ubuntu Dialogue Corpus.

The original Ubuntu Dialogue Corpus as released by Lowe et al. (2015) can be found here: http://cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/

The MovieTriples dataset is available by contacting Iulian Vlad Serban by email, although we strongly recommend that researchers benchmark their models on the Ubuntu Dialogue Corpus and Twitter corpora, as these datasets are substantially larger and represent more well-defined tasks.

For a 7M word dataset, such as the Movie-Scriptolog dataset without any pretraining, this takes about 24 hours to reach convergence.
### References

To test the model afterwards, you can run:
A Hierarchical Latent Variable Encoder-Decoder Model for Generating Dialogues. Iulian Vlad Serban, Alessandro Sordoni, Ryan Lowe, Laurent Charlin, Joelle Pineau, Aaron Courville, Yoshua Bengio. 2016a. http://arxiv.org/abs/1605.06069

THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python evaluate.py --exclude-sos --plot-graphs Output/<model_name> --document_ids Data/Test_Shuffled_Dataset_Labels.txt &> Model_Evaluation.txt
Multiresolution Recurrent Neural Networks: An Application to Dialogue Response Generation. Iulian Vlad Serban, Tim Klinger, Gerald Tesauro, Kartik Talamadupula, Bowen Zhou, Yoshua Bengio, Aaron Courville. 2016b. http://arxiv.org/abs/1606.00776.

where <model_name> is the name automatically generated by train.py.
Building End-To-End Dialogue Systems Using Generative Hierarchical Neural Network Models. Iulian V. Serban, Alessandro Sordoni, Yoshua Bengio, Aaron Courville, Joelle Pineau. 2016c. AAAI. http://arxiv.org/abs/1507.04808.

If your GPU runs out of memory, you can adjust the bs (batch size) parameter inside the state.py, but training will be slower. You can also play around with the other parameters inside state.py.
The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems. Ryan Lowe, Nissan Pow, Iulian Serban, Joelle Pineau. 2015. SIGDIAL. http://arxiv.org/abs/1506.08909.
8 changes: 7 additions & 1 deletion SS_dataset.py
@@ -40,6 +40,12 @@ def run(self):

index = self.indexes[offset]
s = diter.data[index]

# Flatten if this is a list of lists
if len(s) > 0:
if isinstance(s[0], list):
s = [item for sublist in s for item in sublist]

offset += 1

# Append only if it is shorter than max_len
@@ -57,7 +63,7 @@ class SSIterator(object):
def __init__(self,
dialogue_file,
batch_size,
seed=1234,
seed,
max_len=-1,
use_infinite_loop=True,
dtype="int32"):
10 changes: 5 additions & 5 deletions chat.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
__docformat__ = 'restructedtext en'
__authors__ = ("Julian Serban, Alessandro Sordoni")
__contact__ = "Julian Serban <julianserban@gmail.com>"
__authors__ = ("Iulian Serban, Alessandro Sordoni")
__contact__ = "Iulian Serban <julianserban@gmail.com>"

import argparse
import cPickle
@@ -103,7 +103,7 @@ def main():
#sampler = search.RandomSampler(model)
sampler = search.BeamSampler(model)

# Start chat loop
# Start chat loop
utterances = collections.deque()

while (True):
@@ -114,10 +114,10 @@ def main():
while len(utterances) > 0:
utterances.popleft()

current_utterance = [ model.end_sym_sentence ] + ['<first_speaker>'] + var.split() + [ model.end_sym_sentence ]
current_utterance = [ model.end_sym_utterance ] + ['<first_speaker>'] + var.split() + [ model.end_sym_utterance ]
utterances.append(current_utterance)

#TODO Sample a random reply. To spicy it up, we could pick the longest reply or the reply with the fewest placeholders...
#TODO Sample a random reply. To spice it up, we could pick the longest reply or the reply with the fewest placeholders...
seqs = list(itertools.chain(*utterances))

#TODO Retrieve only replies which are generated for second speaker...
69 changes: 37 additions & 32 deletions compute_dialogue_embeddings.py
@@ -58,45 +58,48 @@ def parse_args():
return parser.parse_args()

def compute_encodings(joined_contexts, model, model_compute_encoding, output_second_last_state = False):
# HACK
# TODO Fix seqlen below
seqlen = 160
seqlen = 600
context = numpy.zeros((seqlen, len(joined_contexts)), dtype='int32')
context_lengths = numpy.zeros(len(joined_contexts), dtype='int32')
second_last_utterance_position = numpy.zeros(len(joined_contexts), dtype='int32')


for idx in range(len(joined_contexts)):
context_lengths[idx] = len(joined_contexts[idx])
if context_lengths[idx] < seqlen:
context[:context_lengths[idx], idx] = joined_contexts[idx]
else:
# If context is longer tha max context, truncate it and force the end-of-utterance token at the end
# If context is longer than max context, truncate it and force the end-of-utterance token at the end
context[:seqlen, idx] = joined_contexts[idx][0:seqlen]
context[seqlen-1, idx] = model.eos_sym
context_lengths[idx] = seqlen

eos_indices = list(numpy.where(context[:context_lengths[idx], idx] == model.eos_sym)[0])

if len(eos_indices) > 1:
second_last_utterance_position[idx] = eos_indices[-2]
else:
second_last_utterance_position[idx] = context_lengths[idx]

n_samples = len(joined_contexts)

# Generate the reversed context
reversed_context = numpy.copy(context)
for idx in range(context.shape[1]):
eos_indices = numpy.where(context[:, idx] == model.eos_sym)[0]
prev_eos_index = -1
for eos_index in eos_indices:
reversed_context[(prev_eos_index+2):eos_index, idx] = (reversed_context[(prev_eos_index+2):eos_index, idx])[::-1]
prev_eos_index = eos_index

# Recompute hs only for those particular sentences
# that met the end-of-sentence token
reversed_context = model.reverse_utterances(context)

encoder_states = model_compute_encoding(context, reversed_context, seqlen)
hs = encoder_states[1]
encoder_states = model_compute_encoding(context, reversed_context, seqlen+1)
hidden_states = encoder_states[-2] # hidden state for the "context" encoder, h_s,
# and last hidden state of the utterance "encoder", h
#hidden_states = encoder_states[-1] # mean for the stochastic latent variable, z

if output_second_last_state:
second_last_hidden_state = numpy.zeros((hs.shape[1], hs.shape[2]), dtype='float64')
for i in range(hs.shape[1]):
second_last_hidden_state[i, :] = hs[context_lengths[i] - 1, i, :]
second_last_hidden_state = numpy.zeros((hidden_states.shape[1], hidden_states.shape[2]), dtype='float64')
for i in range(hidden_states.shape[1]):
second_last_hidden_state[i, :] = hidden_states[second_last_utterance_position[i], i, :]

return second_last_hidden_state
else:
return hs[-1, :, :]
return hidden_states[-1, :, :]


def main():
@@ -111,19 +114,21 @@ def main():

logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

state['bs'] = 10

model = DialogEncoderDecoder(state)

if os.path.isfile(model_path):
logger.debug("Loading previous model")
model.load(model_path)
else:
raise Exception("Must specify a valid model path")

contexts = [[]]
lines = open(args.dialogues, "r").readlines()
if len(lines):
contexts = [x.strip().split('\t') for x in lines]
contexts = [x.strip() for x in lines]

model_compute_encoding = model.build_encoder_function()
dialogue_encodings = []

@@ -132,22 +137,21 @@ def main():
batch_index = 0
batch_total = int(math.ceil(float(len(contexts)) / float(model.bs)))
for context_id, context_sentences in enumerate(contexts):

# Convert contextes into list of ids
# Convert contexts into list of ids
joined_context = []

if len(context_sentences) == 0:
joined_context = [model.eos_sym]
else:
joined_context += [model.eos_sym]
for sentence in context_sentences:
sentence_ids = model.words_to_indices(sentence.split())
# Add eos tokens
joined_context += sentence_ids + [model.eos_sym]
joined_context = model.words_to_indices(context_sentences.split())

if joined_context[0] != model.eos_sym:
joined_context = [model.eos_sym] + joined_context

if joined_context[-1] != model.eos_sym:
joined_context += [model.eos_sym]

# HACK
#for i in range(0, 50):
# joined_context += [0] + [model.eos_sym]
#print 'joined_context', joined_context

joined_contexts.append(joined_context)

Expand All @@ -173,3 +177,4 @@ def main():
if __name__ == "__main__":
main()

# THEANO_FLAGS=mode=FAST_COMPILE,floatX=float32 python compute_dialogue_embeddings.py tests/models/1462302387.69_testmodel tests/data/tvalid_contexts.txt Latent_Variable_Means --verbose --use-second-last-state
8 changes: 2 additions & 6 deletions convert-text2dict.py
@@ -30,7 +30,7 @@ def safe_pickle(obj, filename):

import argparse
parser = argparse.ArgumentParser()
parser.add_argument("input", type=str, help="Dialogue file; assumed shuffled with one document (e.g. movie, or Twitter conversation) per line")
parser.add_argument("input", type=str, help="Dialogue file; assumed shuffled with one document (e.g. one movie dialogue, or one Twitter conversation or one Ubuntu conversation) per line")
parser.add_argument("--cutoff", type=int, default=-1, help="Vocabulary cutoff (optional)")
parser.add_argument("--dict", type=str, default="", help="External dictionary (pkl file)")
parser.add_argument("output", type=str, help="Prefix of the pickle binarized dialogue corpus")
@@ -128,7 +128,7 @@ def safe_pickle(obj, filename):
num_terms += len(dialogue_words)

# Compute document frequency statistics
unique_word_indices = set(dialogue_words)
unique_word_indices = set(dialogue_word_ids)
for word_id in unique_word_indices:
df[word_id] += 1

@@ -138,10 +138,6 @@ def safe_pickle(obj, filename):
safe_pickle(binarized_corpus, args.output + ".dialogues.pkl")

if args.dict == "":
# HACK
#raw_dict = [(word, word_id, freqs[word_id], df[word_id]) for word, word_id in vocab.items()]
#noise_probs = [x[2] for x in sorted(raw_dict, key=operator.itemgetter(1))]
#print 'noise_probs', noise_probs
safe_pickle([(word, word_id, freqs[word_id], df[word_id]) for word, word_id in vocab.items()], args.output + ".dict.pkl")

logger.info("Number of unknowns %d" % unknowns)
(Diffs for the remaining changed files are not shown.)
