Skip to content

Commit 70fed93

Browse files
committed
add sync before successful RUN_STOP with target accuracy
1 parent ab63028 commit 70fed93

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

mixture_of_experts_pretraining/mlperf_logging_utils.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,7 @@ def on_train_begin(self, args, state, control, **kwargs):
165165
# device warmup should be done here
166166
self.mllogger.end(key=constants.INIT_STOP, value="")
167167

168+
# run on all ranks to allow sync
168169
barrier()
169170
self.mllogger.start(constants.RUN_START, value="")
170171
self.mllogger.start(
@@ -219,6 +220,9 @@ def on_step_end(
219220
latest_eval_loss = state.log_history[-1]["eval/loss"]
220221
if latest_eval_loss <= self.mllogger.target_eval_loss:
221222
control.should_training_stop = True
223+
224+
# run on all ranks to allow sync
225+
barrier()
222226
self.mllogger.end(
223227
constants.RUN_STOP,
224228
value=latest_eval_loss,

0 commit comments

Comments
 (0)