1. Fix crash caused by saving model to a file in non-existent subdire…

…ctory. 2. Fix crash caused by using cPickle.load/dump a file which is not opened with binary mode.
julianser · monsoon235 · Dec 4, 2019 · Dec 6, 2019 · Dec 4, 2019 · 3fab8aa6f72b6f7c581ccd06a6e9564665782d17
commit 3fab8aa6f72b6f7c581ccd06a6e9564665782d17
diff --git a/SS_dataset.py b/SS_dataset.py
@@ -78,7 +78,7 @@ def __init__(self,
         self.exit_flag = False
 
     def load_files(self):
-        self.data = cPickle.load(open(self.dialogue_file, 'r'))
+        self.data = cPickle.load(open(self.dialogue_file, 'rb'))
         self.data_len = len(self.data)
         logger.debug('Data len is %d' % self.data_len)
 

diff --git a/chat.py b/chat.py
@@ -86,7 +86,7 @@ def main():
     state_path = args.model_prefix + "_state.pkl"
     model_path = args.model_prefix + "_model.npz"
 
-    with open(state_path) as src:
+    with open(state_path, "rb") as src:
         state.update(cPickle.load(src)) 
 
     logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

diff --git a/compute_dialogue_embeddings.py b/compute_dialogue_embeddings.py
@@ -109,7 +109,7 @@ def main():
     state_path = args.model_prefix + "_state.pkl"
     model_path = args.model_prefix + "_model.npz"
 
-    with open(state_path) as src:
+    with open(state_path, "rb") as src:
         state.update(cPickle.load(src))
 
     logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
@@ -172,7 +172,7 @@ def main():
             dialogue_encodings.append(encs[i])
 
     # Save encodings to disc
-    cPickle.dump(dialogue_encodings, open(args.output + '.pkl', 'w'))
+    cPickle.dump(dialogue_encodings, open(args.output + '.pkl', 'wb'))
 
 if __name__ == "__main__":
     main()

diff --git a/convert-text2dict.py b/convert-text2dict.py
@@ -47,7 +47,7 @@ def safe_pickle(obj, filename):
 if args.dict != "":
     # Load external dictionary
     assert os.path.isfile(args.dict)
-    vocab = dict([(x[0], x[1]) for x in cPickle.load(open(args.dict, "r"))])
+    vocab = dict([(x[0], x[1]) for x in cPickle.load(open(args.dict, "rb"))])
 
     # Check consistency
     assert '<unk>' in vocab

diff --git a/convert-wordemb-dict2emb-matrix.py b/convert-wordemb-dict2emb-matrix.py
@@ -111,7 +111,7 @@ def edits1(word):
 
 
 # Load model dictionary
-model_dict = cPickle.load(open(args.model_dictionary, 'r'))
+model_dict = cPickle.load(open(args.model_dictionary, 'rb'))
 
 str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in model_dict])
 i_dim = len(str_to_idx.keys())

diff --git a/create-text-file-for-tests.py b/create-text-file-for-tests.py
@@ -75,13 +75,13 @@ def main():
     # Load state file
     state = prototype_state()
     state_path = args.model_prefix + "_state.pkl"
-    with open(state_path) as src:
+    with open(state_path, "rb") as src:
         state.update(cPickle.load(src))
 
     # Load dictionary
 
     # Load dictionaries to convert str to idx and vice-versa
-    raw_dict = cPickle.load(open(state['dictionary'], 'r'))
+    raw_dict = cPickle.load(open(state['dictionary'], 'rb'))
 
     str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in raw_dict])
     idx_to_str = dict([(tok_id, tok) for tok, tok_id, freq, _ in raw_dict])
@@ -95,7 +95,7 @@ def main():
 
     # Is it a pickle file? Then process using model dictionaries..
     if args.test_file[len(args.test_file)-4:len(args.test_file)] == '.pkl':
-        test_dialogues = cPickle.load(open(args.test_file, 'r'))
+        test_dialogues = cPickle.load(open(args.test_file, 'rb'))
         for test_dialogueid,test_dialogue in enumerate(test_dialogues):
             if test_dialogueid % 100 == 0:
                 print 'test_dialogue', test_dialogueid

diff --git a/dialog_encdec.py b/dialog_encdec.py
@@ -1614,7 +1614,7 @@ def __init__(self, state):
         self.rng = numpy.random.RandomState(state['seed']) 
 
         # Load dictionary
-        raw_dict = cPickle.load(open(self.dictionary, 'r'))
+        raw_dict = cPickle.load(open(self.dictionary, 'rb'))
 
         # Probabilities for each term in the corpus used for noise contrastive estimation (NCE)
         self.noise_probs = [x[2] for x in sorted(raw_dict, key=operator.itemgetter(1))]
@@ -1674,7 +1674,7 @@ def __init__(self, state):
         if self.initialize_from_pretrained_word_embeddings == True:
             # Load pretrained word embeddings from pickled file
             logger.debug("Loading pretrained word embeddings")
-            pretrained_embeddings = cPickle.load(open(self.pretrained_word_embeddings_file, 'r'))
+            pretrained_embeddings = cPickle.load(open(self.pretrained_word_embeddings_file, 'rb'))
 
             # Check all dimensions match from the pretrained embeddings
             assert(self.idim == pretrained_embeddings[0].shape[0])

diff --git a/evaluate.py b/evaluate.py
@@ -63,7 +63,7 @@ def main():
     state_path = args.model_prefix + "_state.pkl"
     model_path = args.model_prefix + "_model.npz"
 
-    with open(state_path) as src:
+    with open(state_path, "rb") as src:
         state.update(cPickle.load(src)) 
 
     logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

diff --git a/generate_encodings.py b/generate_encodings.py
@@ -238,7 +238,7 @@ def get_all_encodings(model, encoding_func, sentenceDict, max_length, nb_sent_ba
         #print "end", encodingDict[keys][0,1950:]
 
     print "----> Dummping the encodings..."
-    cPickle.dump(encodingDict, open(outputName + ".pkl", "w"))
+    cPickle.dump(encodingDict, open(outputName + ".pkl", "wb"))
     print "\tL----> Done."
 
     return encodingDict
@@ -254,7 +254,7 @@ def init(path):
     state_path   = path  + "_state.pkl"
     model_path   = path  + "_model.npz"
 
-    with open(state_path) as src:
+    with open(state_path, "rb") as src:
         state.update(cPickle.load(src))
 
     logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

diff --git a/model.py b/model.py
@@ -1,6 +1,7 @@
 import logging
 import numpy
 import theano
+import os
 logger = logging.getLogger(__name__)
 
 # This is the list of strings required to ignore, if we're going to take a pretrained HRED model 
@@ -19,6 +20,7 @@ def save(self, filename):
         Save the model to file `filename`
         """
         vals = dict([(x.name, x.get_value()) for x in self.params])
+        os.makedirs(os.path.split(filename)[0])
         numpy.savez(filename, **vals)
 
     def load(self, filename, parameter_strings_to_ignore=[]):

diff --git a/sample.py b/sample.py
@@ -71,7 +71,7 @@ def main():
     state_path = args.model_prefix + "_state.pkl"
     model_path = args.model_prefix + "_model.npz"
 
-    with open(state_path) as src:
+    with open(state_path, "rb") as src:
         state.update(cPickle.load(src))
 
     logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

diff --git a/split-examples-by-token.py b/split-examples-by-token.py
@@ -64,7 +64,7 @@ def magicsplit(l, *splitters):
     raise Exception("Input file not found!")
 
 logger.info("Loading dialogue corpus")
-data = cPickle.load(open(args.input, 'r'))
+data = cPickle.load(open(args.input, 'rb'))
 data_len = len(data)
 
 logger.info('Corpus loaded... Data len is %d' % data_len)

diff --git a/train.py b/train.py
@@ -65,8 +65,9 @@ def save(model, timings, post_fix = ''):
     start = time.time()
     s = signal.signal(signal.SIGINT, signal.SIG_IGN)
 
+    os.makedirs(model.state['save_dir'])
     model.save(model.state['save_dir'] + '/' + model.state['run_id'] + "_" + model.state['prefix'] + post_fix + 'model.npz')
-    cPickle.dump(model.state, open(model.state['save_dir'] + '/' +  model.state['run_id'] + "_" + model.state['prefix'] + post_fix + 'state.pkl', 'w'))
+    cPickle.dump(model.state, open(model.state['save_dir'] + '/' +  model.state['run_id'] + "_" + model.state['prefix'] + post_fix + 'state.pkl', 'wb'))
     numpy.savez(model.state['save_dir'] + '/' + model.state['run_id'] + "_" + model.state['prefix'] + post_fix + 'timing.npz', **timings)
     signal.signal(signal.SIGINT, s)
 
@@ -135,7 +136,7 @@ def main(args):
         if os.path.isfile(state_file) and os.path.isfile(timings_file):
             logger.debug("Loading previous state")
 
-            state = cPickle.load(open(state_file, 'r'))
+            state = cPickle.load(open(state_file, 'rb'))
             timings = dict(numpy.load(open(timings_file, 'r')))
             for x, y in timings.items():
                 timings[x] = list(y)