From 70fdf26a65bf83f0fa7fed09cc6c943bdc4491cb Mon Sep 17 00:00:00 2001 From: adityaranjan Date: Tue, 3 Oct 2023 22:05:53 -0400 Subject: [PATCH] tryin somethin --- axonn/axonn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/axonn/axonn.py b/axonn/axonn.py index 6012020..0ae73d9 100644 --- a/axonn/axonn.py +++ b/axonn/axonn.py @@ -44,7 +44,8 @@ # bf16 all reduce, only applicable with mixed precision _bf16_all_reduce = None # loss_scale -loss_scale = 2.0**16 +# loss_scale = 2.0**16 +loss_scale = 1 max_scale = 2.0**24 min_scale = 2.0**10 scaling_window = 200 @@ -835,6 +836,7 @@ def _allreduce_and_descale(): comm_handle.allreduce(model_grads_fp32, async_op=False) model_grads_bf16.zero_() - local_overflow = _check_nan(model_grads_fp32) - global_overflow = _sync_scale(local_overflow) - fp32_optimizer.skip_next_step = global_overflow + + # local_overflow = _check_nan(model_grads_fp32) + # global_overflow = _sync_scale(local_overflow) + # fp32_optimizer.skip_next_step = global_overflow