@@ -1085,6 +1085,102 @@ def test_distribution_and_structure(
         first_pad_idx = padding_mask.nonzero(as_tuple=True)[0][0].item()
         self.assertTrue(torch.all(row[first_pad_idx:] == pad_token_id))
 
+    def test_calculate_loss_and_backward_no_accumulation_steps_division(self):
+        """Test that the loss is NOT divided by accumulation_steps.
+
+        This test verifies the bug fix: the old code incorrectly divided the loss by
+        accumulation_steps, so the gradient magnitude depended on the batch-splitting
+        configuration rather than on the actual number of tokens.
+
+        With the fix, the loss should be the same regardless of the accumulation_steps
+        value (since we're using the same data and token count).
+        """
+        torch.manual_seed(42)
+        np.random.seed(42)
+
+        batch_size = 4
+        seq_len = 16
+
+        local_logprobs = torch.randn(batch_size, seq_len)
+        old_logprobs = torch.randn(batch_size, seq_len)
+        ref_logprob = torch.randn(batch_size, seq_len)
+        advantages = torch.randn(batch_size, seq_len + 1)
+        response_masks_bool = torch.ones(batch_size, seq_len, dtype=torch.bool)
+        entropy = torch.randn(batch_size, seq_len)
+
+        args = grpo_fast.Args()
+        args.clip_lower = 0.2
+        args.clip_higher = 0.2
+        args.beta = 0.05
+        args.kl_estimator = "kl3"
+        args.masked_mean_axis = None
+        args.masked_mean_denominator = None
+        args.truncated_importance_sampling_ratio_cap = 0.0
+        args.record_entropy = False
+
+        mock_model_1 = Mock()
+        mock_model_1.backward = Mock()
+        loss_statistics_1 = grpo_fast.LossStatistics(num_batches=1, record_entropy=False)
+
+        grpo_fast.calculate_loss_and_backward(
+            mock_model_1,
+            0,
+            loss_statistics_1,
+            local_logprobs.clone(),
+            old_logprobs.clone(),
+            ref_logprob.clone(),
+            advantages.clone(),
+            response_masks_bool.clone(),
+            None,
+            entropy.clone(),
+            accumulation_steps=1,
+            local_step=0,
+            args=args,
+        )
+
+        loss_with_accum_1 = mock_model_1.backward.call_args[0][0].item()
+
+        torch.manual_seed(42)
+        np.random.seed(42)
+
+        local_logprobs = torch.randn(batch_size, seq_len)
+        old_logprobs = torch.randn(batch_size, seq_len)
+        ref_logprob = torch.randn(batch_size, seq_len)
+        advantages = torch.randn(batch_size, seq_len + 1)
+        response_masks_bool = torch.ones(batch_size, seq_len, dtype=torch.bool)
+        entropy = torch.randn(batch_size, seq_len)
+
+        mock_model_4 = Mock()
+        mock_model_4.backward = Mock()
+        loss_statistics_4 = grpo_fast.LossStatistics(num_batches=1, record_entropy=False)
+
+        grpo_fast.calculate_loss_and_backward(
+            mock_model_4,
+            0,
+            loss_statistics_4,
+            local_logprobs.clone(),
+            old_logprobs.clone(),
+            ref_logprob.clone(),
+            advantages.clone(),
+            response_masks_bool.clone(),
+            None,
+            entropy.clone(),
+            accumulation_steps=4,
+            local_step=0,
+            args=args,
+        )
+
+        loss_with_accum_4 = mock_model_4.backward.call_args[0][0].item()
+
+        self.assertAlmostEqual(
+            loss_with_accum_1,
+            loss_with_accum_4,
+            places=5,
+            msg=f"Loss should be the same regardless of accumulation_steps. "
+            f"Got {loss_with_accum_1:.6f} (accum=1) vs {loss_with_accum_4:.6f} (accum=4). "
+            f"Old buggy code would have made the accum=4 loss 4x smaller.",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
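
For readers skimming the diff, the property this test pins down can be shown with a minimal, self-contained sketch (illustrative names only, nothing below is grpo_fast's API): when each micro-batch loss is normalized by a global token count, the gradient accumulated across micro-batches is independent of accumulation_steps, whereas an extra division by accumulation_steps would couple the gradient magnitude to the batch-splitting configuration.

```python
import torch


def per_token_losses(w, tokens):
    # Stand-in for a per-token policy loss: any differentiable function of w.
    return w * tokens


tokens = torch.arange(1.0, 17.0)  # 16 "tokens" in one global batch
n_tokens = tokens.numel()

# Reference: a single backward pass over the whole batch, normalized by token count.
w = torch.ones(1, requires_grad=True)
(per_token_losses(w, tokens).sum() / n_tokens).backward()
full_batch_grad = w.grad.item()

# Gradient accumulation over 4 micro-batches, still normalizing by the *global* token
# count and NOT dividing by accumulation_steps: the accumulated gradient matches the
# full-batch reference exactly, no matter how the batch was split.
w = torch.ones(1, requires_grad=True)
for micro in tokens.chunk(4):
    (per_token_losses(w, micro).sum() / n_tokens).backward()
assert abs(w.grad.item() - full_batch_grad) < 1e-6

# Dividing each micro-batch loss by accumulation_steps on top of this normalization
# would shrink the accumulated gradient by that factor, tying it to the splitting
# configuration, which is the behavior the test above guards against.
```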