Ensure last checkpoint decoder results are written to metrics file when cleaning up training (#368)

fhieber · web-flow · commit 12fab9be9db4 · 2018-04-21T19:50:44.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [1.18.5]
+### Fixed
+- Fixed a problem with trainer not waiting for the last checkpoint decoder (#367).
+
 ## [1.18.4]
 ### Added
 - Added options to control training length w.r.t number of updates/batches or number of samples:
diff --git a/sockeye/__init__.py b/sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '1.18.4'
+__version__ = '1.18.5'
diff --git a/sockeye/training.py b/sockeye/training.py
@@ -630,7 +630,7 @@ def fit(self,
 
                 tic = time.time()
 
-        self._cleanup(lr_decay_opt_states_reset)
+        self._cleanup(lr_decay_opt_states_reset, process_manager=process_manager)
         logger.info("Training finished. Best checkpoint: %d. Best validation %s: %.6f",
                     self.state.best_checkpoint, early_stopping_metric, self.state.best_metric)
         return self.state.best_metric
@@ -723,7 +723,6 @@ def _update_metrics(self,
             checkpoint_metrics["%s-val" % name] = value
 
         if process_manager is not None:
-            process_manager.wait_to_finish()
             result = process_manager.collect_results()
             if result is not None:
                 decoded_checkpoint, decoder_metrics = result
@@ -749,12 +748,12 @@ def _cleanup(self, lr_decay_opt_states_reset: str, process_manager: Optional['De
         utils.cleanup_params_files(self.model.output_dir, self.max_params_files_to_keep,
                                    self.state.checkpoint, self.state.best_checkpoint)
         if process_manager is not None:
-            process_manager.wait_to_finish()
             result = process_manager.collect_results()
             if result is not None:
                 decoded_checkpoint, decoder_metrics = result
                 self.state.metrics[decoded_checkpoint - 1].update(decoder_metrics)
                 self.tflogger.log_metrics(decoder_metrics, decoded_checkpoint)
+            utils.write_metrics_file(self.state.metrics, self.metrics_fname)
 
         final_training_state_dirname = os.path.join(self.model.output_dir, C.TRAINING_STATE_DIRNAME)
         if os.path.exists(final_training_state_dirname):
@@ -1139,6 +1138,7 @@ def collect_results(self) -> Optional[Tuple[int, Dict[str, float]]]:
             return None
         decoded_checkpoint, decoder_metrics = self.decoder_metric_queue.get()
         assert self.decoder_metric_queue.empty()
+        logger.info("Decoder-%d finished: %s", decoded_checkpoint, decoder_metrics)
         return decoded_checkpoint, decoder_metrics
 
     def wait_to_finish(self):
@@ -1147,14 +1147,15 @@ def wait_to_finish(self):
         if not self.decoder_process.is_alive():
             self.decoder_process = None
             return
-        logger.warning("Waiting for process %s to finish.", self.decoder_process.name)
+        name = self.decoder_process.name
+        logger.warning("Waiting for process %s to finish.", name)
         wait_start = time.time()
         self.decoder_process.join()
         self.decoder_process = None
         wait_time = int(time.time() - wait_start)
-        logger.warning("Had to wait %d seconds for the checkpoint decoder to finish. Consider increasing the "
+        logger.warning("Had to wait %d seconds for the Checkpoint %s to finish. Consider increasing the "
                        "checkpoint frequency (updates between checkpoints, see %s) or reducing the size of the "
-                       "validation samples that are decoded (see %s)." % (wait_time,
+                       "validation samples that are decoded (see %s)." % (wait_time, name,
                                                                           C.TRAIN_ARGS_CHECKPOINT_FREQUENCY,
                                                                           C.TRAIN_ARGS_MONITOR_BLEU))