Add a sync barrier before logging a successful RUN_STOP (target accuracy reached)

ZhiyuLi-goog · ZhiyuLi-goog · commit 70fed93febad · 2025-01-10T13:56:06.000-08:00
diff --git a/mixture_of_experts_pretraining/mlperf_logging_utils.py b/mixture_of_experts_pretraining/mlperf_logging_utils.py
@@ -165,6 +165,7 @@ def on_train_begin(self, args, state, control, **kwargs):
         # device warmup should be done here
         self.mllogger.end(key=constants.INIT_STOP, value="")
 
+        # barrier() runs on all ranks so they are in sync before RUN_START is logged
         barrier()
         self.mllogger.start(constants.RUN_START, value="")
         self.mllogger.start(
@@ -219,6 +220,9 @@ def on_step_end(
                 latest_eval_loss = state.log_history[-1]["eval/loss"]
             if latest_eval_loss <= self.mllogger.target_eval_loss:
                 control.should_training_stop = True
+
+                # barrier() runs on all ranks so they are in sync before RUN_STOP is logged
+                barrier()
                 self.mllogger.end(
                     constants.RUN_STOP,
                     value=latest_eval_loss,