Commit 3f716d6

Updated code
1 parent abe4387 commit 3f716d6

2 files changed: +12 −8 lines changed

CLAUDE.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -8,3 +8,8 @@
 - To run the `./scripts/train/build_image_and_launch.sh` script, you must commit the current changes.
 - Launch tool use experiments by running `./scripts/train/build_image_and_launch.sh scripts/train/debug/tool_grpo_fast.sh`.
 - Launch multi-node non-tool experiments by running `./scripts/train/build_image_and_launch.sh scripts/train/debug/large_test_script.sh`.
+
+# Comments Policy
+- NEVER remove existing comments from code when making edits unless they are obviously outdated, in which case ALWAYS ask for permission.
+- Always preserve all existing comments, especially explanatory ones
+- Only add comments when they are needed for clarity
```

open_instruct/grpo_fast.py

Lines changed: 7 additions & 8 deletions
```diff
@@ -623,7 +623,7 @@ def maybe_apply_importance_sampling(
     return unclipped_pg_loss, clipped_pg_loss
 
 
-def calculate_loss(
+def calculate_loss_and_backward(
     model: deepspeed.DeepSpeedEngine,
     i: int,
     loss_statistics: LossStatistics,
@@ -638,14 +638,13 @@ def calculate_loss(
     local_step: int,
     args: Args,
 ) -> int:
-    """Calculate and apply GRPO loss for a single minibatch.
+    """Calculate GRPO loss and perform backward pass for a single minibatch.
 
     Computes the policy gradient loss using the clipped surrogate objective from PPO,
-    combines it with a KL penalty term, performs the backward pass, and optionally
-    steps the optimizer.
+    combines it with a KL penalty term, and performs the backward pass.
 
     Args:
-        model: Model wrapper with backward() and step() methods (e.g., DeepSpeed engine)
+        model: Model wrapper with backward() method (e.g., DeepSpeed engine)
         i: Minibatch index for tracking statistics
         loss_statistics: LossStatistics object to accumulate training metrics
         local_logprobs: Log probabilities from current policy (shape: [batch, seq_len])
@@ -688,8 +687,6 @@ def calculate_loss(
     )
     loss = loss / accumulation_steps
     model.backward(loss)
-    if (local_step + 1) % accumulation_steps == 0:
-        model.step()
 
     with torch.no_grad():
         loss_statistics.update_stats(
@@ -1298,7 +1295,7 @@ def train(
                     f"response_mask sum={mb_response_masks_bool.sum()}"
                 )
 
-                local_step = calculate_loss(
+                local_step = calculate_loss_and_backward(
                     self.model,
                     i,
                     loss_statistics,
@@ -1313,6 +1310,8 @@ def train(
                     local_step,
                     args,
                 )
+                if local_step % accumulation_steps == 0:
+                    self.model.step()
 
                 local_metrics |= loss_statistics.to_dict()
                 local_metrics["lr"] = self.scheduler.get_last_lr()[0]
```
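In short, the commit narrows `calculate_loss_and_backward` to computing the loss and accumulating gradients, and moves the optimizer step into `train`, which now calls `self.model.step()` once every `accumulation_steps` minibatches. The modulo check also shifts from `(local_step + 1)` to `local_step`, which is consistent if the function returns the incremented counter (its `-> int` return type and the `local_step = calculate_loss_and_backward(...)` assignment suggest it does). Below is a minimal sketch of the resulting pattern, using a plain PyTorch model and optimizer as stand-ins for the DeepSpeed engine and the actual GRPO loss (both are assumptions, not the repo's code):

```python
import torch

def calculate_loss_and_backward(model, batch, accumulation_steps, local_step):
    """Compute the loss for one minibatch and accumulate gradients.

    Mirrors the refactor: this function only calls backward(); stepping
    the optimizer is now the caller's responsibility. The loss here is a
    placeholder for the real GRPO objective.
    """
    loss = model(batch).mean()        # stand-in for the clipped PG loss + KL penalty
    loss = loss / accumulation_steps  # scale so accumulated gradients average out
    loss.backward()                   # with DeepSpeed this would be model.backward(loss)
    return local_step + 1             # assumed: the counter comes back incremented

# Caller (cf. the new train() code): step once every `accumulation_steps` minibatches.
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 2
local_step = 0

for batch in torch.randn(8, 3, 4):  # eight dummy minibatches
    local_step = calculate_loss_and_backward(model, batch, accumulation_steps, local_step)
    if local_step % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()  # DeepSpeed's step() clears gradients itself
```

Splitting the step out this way keeps the per-minibatch function free of optimizer state, so the accumulation boundary is visible in exactly one place in the training loop.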
