Enables specified cp rank slicing #2387
base: main
```diff
@@ -4,7 +4,7 @@
 """Context Parallelism."""
 import os
-from typing import List, Union, Tuple
+from typing import List, Union, Optional, Tuple
 import torch
 import transformer_engine_torch as tex
```
```diff
@@ -4016,7 +4016,8 @@ def get_batch_on_this_cp_rank(
     input_ids_padded: torch.Tensor,
     labels_padded: torch.Tensor,
     position_ids_padded: torch.Tensor,
-    cp_group: torch.distributed.ProcessGroup = None,
+    cp_size: Optional[int] = None,
+    cp_rank: Optional[int] = None,
     qvk_format: str = "thd",
 ):
     """Slice batch input along sequence dimension into multiple chunks for THD format.
@@ -4026,14 +4027,18 @@ def get_batch_on_this_cp_rank(
     Which are parallelized across GPUs in a context parallel group.
     This version works with variable-length sequences using cumulative sequence lengths.

+    If cp_rank is provided, it will slice the batch for the provided rank.
```
Contributor

Could you put this next to `cp_group` in the input list? Also, I think the better docstring would be

but maybe the better option would be to have a
Contributor (Author)

I put it next to

The function already takes in a
Contributor (Author)

If it's for context parallel, I expect the user to already have their distributed process groups set up, right?
```diff
     """
     if qvk_format not in ["thd", "bshd", "sbhd"]:
         raise ValueError(f"Unsupported qvk_format: {qvk_format}!")
     if qvk_format == "thd":
         # Get context parallel size and rank
-        cp_size = torch.distributed.get_world_size(group=cp_group)
+        if cp_size > 1:
```
Contributor

logic: will fail with `TypeError` if `cp_size` is `None`, since `None > 1` is not a valid comparison in Python 3.
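A minimal sketch of the failure mode the comment describes, in plain Python without torch; the helper name is hypothetical and only illustrates the guard from the diff:

```python
def enters_cp_slicing(cp_size):
    # Mirrors the `if cp_size > 1:` guard from the diff. Without an
    # explicit None check, `None > 1` raises TypeError in Python 3,
    # so cp_size must be validated (or resolved, e.g. from the
    # process group) before the comparison.
    if cp_size is None:
        raise ValueError("cp_size must be provided (or derived from a process group).")
    return cp_size > 1
```

With the `None` check in place the caller gets a descriptive `ValueError` instead of an opaque `TypeError` from the comparison.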
```diff
-        cp_rank = torch.distributed.get_rank(group=cp_group)
+            if not (0 <= cp_rank < cp_size):
+                raise ValueError(f"cp_rank must be in [0, {cp_size}), but received {cp_rank}.")
+            if cp_rank is None:
+                raise ValueError("cp_rank must be provided when cp_size > 1.")
```
Comment on lines +4038 to +4041

Contributor

logic: the validation checks are in the wrong order and will fail when `cp_rank` is `None`. Line 4038 checks the range before line 4040 checks for `None`, so `0 <= None < cp_size` raises a `TypeError` before the intended `ValueError` is ever reached. Swap the two checks so the `None` check runs first.
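The corrected ordering can be sketched as follows (the helper name is hypothetical; the messages match the diff):

```python
def validate_cp_rank(cp_rank, cp_size):
    # The None check must come first: evaluating `0 <= None < cp_size`
    # itself raises TypeError, masking the intended ValueError.
    if cp_rank is None:
        raise ValueError("cp_rank must be provided when cp_size > 1.")
    if not (0 <= cp_rank < cp_size):
        raise ValueError(f"cp_rank must be in [0, {cp_size}), but received {cp_rank}.")
    return cp_rank
```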
```diff

         # Calculate the chunk sizes for each sequence
         total_slices_of_any_sequence = 2 * cp_size
```
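For context, the `2 * cp_size` factor is consistent with a load-balanced context-parallel slicing scheme for causal attention, where each sequence is split into `2 * cp_size` slices and each rank takes one slice from the front and its mirror from the back. A rough sketch, under that assumption (not code from this PR):

```python
def cp_slice_indices(cp_size, cp_rank):
    # Assumed load-balanced scheme: split each sequence into
    # 2 * cp_size slices; rank r processes slice r and slice
    # (2 * cp_size - 1 - r), so every rank sees a balanced mix of
    # early (cheap) and late (expensive) causal positions.
    total_slices = 2 * cp_size
    return (cp_rank, total_slices - 1 - cp_rank)

# e.g. with cp_size=4: rank 0 gets slices (0, 7), rank 3 gets (3, 4).
```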
Contributor

logic: breaking change - removed backward compatibility with the `cp_group` parameter. The old signature had `cp_group: torch.distributed.ProcessGroup = None` and would call `torch.distributed.get_world_size(group=cp_group)` and `torch.distributed.get_rank(group=cp_group)` as a fallback. The new code removes this entirely, breaking existing callers. Either restore the fallback logic or update the PR description to mark this as a breaking change.
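One way to restore the old behaviour could look like the sketch below. It is written without real process-group calls so it runs standalone: the injected callables stand in for `torch.distributed.get_world_size` / `torch.distributed.get_rank`, and the helper name is hypothetical:

```python
def resolve_cp_size_and_rank(cp_size=None, cp_rank=None,
                             get_world_size=lambda: 1,
                             get_rank=lambda: 0):
    # Backward-compatible resolution: explicit values win; anything
    # left as None falls back to the process group, mirroring the
    # old cp_group-based code path.
    if cp_size is None:
        cp_size = get_world_size()
    if cp_rank is None:
        cp_rank = get_rank()
    if not (0 <= cp_rank < cp_size):
        raise ValueError(f"cp_rank must be in [0, {cp_size}), but received {cp_rank}.")
    return cp_size, cp_rank
```

This keeps existing `cp_group`-style callers working while still letting new callers pin `cp_size` / `cp_rank` explicitly, which is the feature this PR adds.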