[JAX] Add CP + THD + AG + Striped>1 + SWA support #2379
@@ -327,7 +327,7 @@ def test_cross_attn(
 ]

 DISTRIBUTED_CONTEXT_SELF_ATTN_DATA_SHAPES = [
-    # Sequence lengths will be scaled by CP so that we don't run with tiny sizes.
+    # Sequence lengths will be scaled by CP*2 so that we don't run with tiny sizes.
     pytest.param([2, 128, 8, 128], id="2-128xCP-8-128"),
     pytest.param([4, 256, 16, 64], id="4-256xCP-16-64"),

Collaborator: nit: update pytest param id to include

 ]

@@ -351,12 +351,14 @@ def impl_test_context_parallel_attn(
         use_shardy,
         use_scan_ring=False,
         window_size=None,
+        stripe_size=0,
+        num_segments_per_seq=0,
     ):
         if qkv_layout.is_thd():
-            if cp_strategy == CPStrategy.ALL_GATHER:
-                pytest.skip("THD doesn't support all gather context parallelism.")
-            if not load_balanced and cp_strategy == CPStrategy.RING:
-                pytest.skip("THD + ring doesn't support unbalanced context parallelism.")
+            if not load_balanced and (
+                cp_strategy == CPStrategy.RING or cp_strategy == CPStrategy.ALL_GATHER
+            ):
+                pytest.skip(f"THD + {cp_strategy=} doesn't support unbalanced context parallelism.")

Collaborator: today I learned

         assert not use_scan_ring or cp_strategy == CPStrategy.RING

@@ -382,7 +384,6 @@ def impl_test_context_parallel_attn(
         data_shape = batch, seqlen, num_head, hidden

         num_kv_heads = num_head // kv_groups

         runner = FusedAttnRunner(
             batch,
             seqlen,

@@ -407,6 +408,8 @@ def impl_test_context_parallel_attn(
             mesh_resource=mesh_resource,
             cp_strategy=cp_strategy,
             cp_load_balanced=load_balanced,
+            stripe_size=stripe_size,
+            num_segments_per_seq=num_segments_per_seq,
         )

         def check_has_backend_for_mask(mask_type):

@@ -457,7 +460,7 @@ def check_has_backend_for_mask(mask_type):
     @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
     @pytest.mark.parametrize(
         "qkv_layout, attn_mask_type",
-        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS,
+        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS[:-1],

Collaborator: Why are we skipping a mask here?

Collaborator: Is this because the new test below is explicitly testing the index=-1 mask case? If so, can we remove this last mask from the

Collaborator (Author): I will revert this back as it makes no functional difference.

Collaborator (Author): However, the logic to skip this for the THD data layout in impl_test_context_parallel_attn() has changed, so just reverting the change will not be enough to skip this for THD data types. I will add an additional check in test_context_parallel_allgather_attn_shardy() and test_context_parallel_allgather_attn() to skip THD layouts.

Collaborator (Author): In either case it is just a different way to put things - the older change was skipping THD layouts by filtering them via DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS[:-1], but now we will pass DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS and instead filter within test_context_parallel_allgather_attn_shardy() and test_context_parallel_allgather_attn() explicitly.

     )
     def test_context_parallel_allgather_attn_shardy(
         self,

@@ -486,6 +489,72 @@ def test_context_parallel_allgather_attn_shardy(
             use_shardy=True,
         )

+    @pytest_parametrize_wrapper(
+        "device_count,mesh_shape,mesh_axes,mesh_resource",
+        generate_context_parallel_configs_for_attn(),
+    )
+    @pytest.mark.parametrize("data_shape", DISTRIBUTED_CONTEXT_SELF_ATTN_DATA_SHAPES[:1])
+    @pytest.mark.parametrize("kv_groups", [1, 8])
+    @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
+    @pytest.mark.parametrize(
+        "qkv_layout, attn_mask_type",
+        [DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS[-1]],
+    )
+    @pytest.mark.parametrize(
+        "load_balanced",
+        [pytest.param(True, id="BALANCED")],
+    )
+    @pytest.mark.parametrize(
+        "stripe_size",
+        [pytest.param(64, id="STRIPE-64"), pytest.param(128, id="STRIPE-128")],

Collaborator: Doesn't have to be in this PR since I'm guessing this applies to more tests than updated here, but TE/JAX has a pytest util called

+    )
+    @pytest.mark.parametrize(
+        "window_size",
+        [
+            pytest.param((-1, -1), id="window_size(-1, -1)"),
+            pytest.param((5, 0), id="window_size(5, 0)"),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "num_segments_per_seq",
+        [pytest.param(2, id="SEG-2"), pytest.param(11, id="SEG-11")],
+    )
+    def test_context_parallel_allgather_striped_attn(
+        self,
+        device_count,
+        mesh_shape,
+        mesh_axes,
+        mesh_resource,
+        data_shape,
+        kv_groups,
+        attn_mask_type,
+        dtype,
+        qkv_layout,
+        load_balanced,
+        window_size,
+        stripe_size,
+        num_segments_per_seq,
+    ):
+        if window_size != (-1, -1) and not qkv_layout.is_thd():

Collaborator: So BSHD with window_size = (-1, -1) is supported? Does that mean window_size (-1, -1) means don't do any striping?

Collaborator (Author): I will update this check to: I believe my parametrization of the inputs, [DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS[-1]], did not trigger a BSHD mask anyways, so this check never got triggered.

+            pytest.skip("Sliding window attention is only supported for THD layout")
+        self.impl_test_context_parallel_attn(
+            device_count,
+            mesh_shape,
+            mesh_axes,
+            mesh_resource,
+            data_shape,
+            kv_groups,
+            attn_mask_type,
+            dtype,
+            qkv_layout,
+            load_balanced,
+            CPStrategy.ALL_GATHER,
+            use_shardy=False,
+            window_size=window_size,
+            stripe_size=stripe_size,
+            num_segments_per_seq=num_segments_per_seq,
+        )

     @pytest_parametrize_wrapper(
         "device_count,mesh_shape,mesh_axes,mesh_resource",
         generate_context_parallel_configs_for_attn(),

@@ -495,7 +564,7 @@ def test_context_parallel_allgather_attn_shardy(
     @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
     @pytest.mark.parametrize(
         "qkv_layout, attn_mask_type",
-        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS,
+        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS[:-1],

Collaborator: Same comment as above with this [:-1] slicing of the masks. If this array of masks [:-1] is for one feature and [-1:] is for this new CP striped attention feature, let's split them into two constant lists.

Collaborator (Author): I will revert this back as it makes no functional difference.

Collaborator (Author): https://github.com/NVIDIA/TransformerEngine/pull/2379/files#r2566226953

     )
     @pytest.mark.parametrize(
         "load_balanced",

@@ -538,7 +607,7 @@ def test_context_parallel_allgather_attn(
     @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
     @pytest.mark.parametrize(
         "qkv_layout, attn_mask_type",
-        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS,
+        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS[:-1],
     )
     @pytest.mark.parametrize(
         "load_balanced",

@@ -602,7 +671,7 @@ def test_context_parallel_ring_attn(
     @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
     @pytest.mark.parametrize(
         "qkv_layout, attn_mask_type",
-        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS,
+        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS[:-1],
     )
     def test_context_parallel_ring_attn_shardy(
         self,

@@ -639,31 +708,39 @@ def test_context_parallel_ring_attn_shardy(
     "L2": [[4, 32, 12, 32], [1, 16, 1, 1]],
 }

+REORDER_STRATEGY = [
+    pytest.param(ReorderStrategy.DualChunkSwap, None, id="DualChunkSwap"),
+    pytest.param(ReorderStrategy.Striped, 1, id="Striped-1"),
+    pytest.param(ReorderStrategy.Striped, 4, id="Striped-4"),
+]


 class TestReorderCausalLoadBalancing:
     @pytest.mark.parametrize("cp_size", [2, 4, 8])
     @pytest_parametrize_wrapper("shape", REORDER_CAUSAL_LOAD_BALANCING_DATA_SHAPES)
-    @pytest.mark.parametrize("qkv_format", [QKVFormat.BSHD, QKVFormat.SBHD])
+    @pytest.mark.parametrize("qkv_format", [QKVFormat.BSHD, QKVFormat.SBHD, QKVFormat.THD])
     @pytest.mark.parametrize(
-        "reorder_strategy",
-        [
-            pytest.param(ReorderStrategy.DualChunkSwap, id="DualChunkSwap"),
-            pytest.param(ReorderStrategy.Striped, id="Striped"),
-        ],
+        "reorder_strategy, stripe_size",
+        REORDER_STRATEGY,
     )
-    def test(self, cp_size, shape, qkv_format, reorder_strategy):
+    def test(self, cp_size, shape, qkv_format, reorder_strategy, stripe_size):
         tensor = random.normal(random.PRNGKey(1124), shape, dtype=jnp.bfloat16)
         seq_dim = 1
         if qkv_format == QKVFormat.SBHD:
             tensor = tensor.swapaxes(0, 1)
             seq_dim = 0

+        if reorder_strategy == ReorderStrategy.Striped:
+            seq_lens = shape[seq_dim]
+            if seq_lens < (cp_size * stripe_size):
+                pytest.skip(f"{seq_lens=} must be larger than {cp_size*stripe_size=}")

         ref = tensor.copy()

-        reorder = jax.jit(reorder_causal_load_balancing, static_argnums=[1, 2, 3])
-        inverse = jax.jit(inverse_reorder_causal_load_balancing, static_argnums=[1, 2, 3])
+        reorder = jax.jit(reorder_causal_load_balancing, static_argnums=[1, 2, 3, 4])
+        inverse = jax.jit(inverse_reorder_causal_load_balancing, static_argnums=[1, 2, 3, 4])

-        reordered = reorder(tensor, reorder_strategy, cp_size, seq_dim)
-        inversed = inverse(reordered, reorder_strategy, cp_size, seq_dim)
+        reordered = reorder(tensor, reorder_strategy, cp_size, seq_dim, stripe_size)
+        inversed = inverse(reordered, reorder_strategy, cp_size, seq_dim, stripe_size)

         assert jnp.array_equal(inversed, ref)
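
To make the stripe_size semantics concrete, here is a small self-contained sketch. It is not the TE/JAX implementation, only an illustration of one plausible striped assignment: the sequence is cut into contiguous stripes of stripe_size tokens, the stripes are dealt round-robin to the cp_size ranks, and the inverse permutation restores the original order, which is the round-trip property the test above asserts.

import jax.numpy as jnp

def striped_reorder(x, cp_size, seq_dim, stripe_size):
    # Hypothetical illustration: group the sequence into stripes of
    # `stripe_size` tokens, then order stripes by owning rank so that
    # rank r ends up with stripes r, r + cp_size, r + 2*cp_size, ...
    seqlen = x.shape[seq_dim]
    num_stripes = seqlen // stripe_size
    stripe_ids = jnp.arange(num_stripes)
    perm = jnp.concatenate([stripe_ids[r::cp_size] for r in range(cp_size)])
    token_perm = (perm[:, None] * stripe_size + jnp.arange(stripe_size)).reshape(-1)
    return jnp.take(x, token_perm, axis=seq_dim)

def striped_inverse(x, cp_size, seq_dim, stripe_size):
    # Invert by scattering tokens back to their original positions.
    seqlen = x.shape[seq_dim]
    num_stripes = seqlen // stripe_size
    stripe_ids = jnp.arange(num_stripes)
    perm = jnp.concatenate([stripe_ids[r::cp_size] for r in range(cp_size)])
    token_perm = (perm[:, None] * stripe_size + jnp.arange(stripe_size)).reshape(-1)
    return jnp.take(x, jnp.argsort(token_perm), axis=seq_dim)

x = jnp.arange(16)  # 16 "tokens"
y = striped_reorder(x, cp_size=2, seq_dim=0, stripe_size=4)
# y == [0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15]: rank 0 gets stripes 0 and 2,
# rank 1 gets stripes 1 and 3, so each rank sees a balanced mix of early and
# late positions under a causal mask.
assert jnp.array_equal(striped_inverse(y, 2, 0, 4), x)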
@@ -230,6 +230,7 @@ def make_mask(
 @jax.jit
 def get_seqlens_and_offsets(segment_ids):
     batch, max_seqlen = segment_ids.shape
+    # TODO: should this be max_seqlen + 1 ?

Collaborator: reminder about TODO

Collaborator (Author): I am removing this for now as it is not related to this PR.

     bincount_vmap = jax.vmap(partial(jnp.bincount, length=max_seqlen))
     seqlens_with_zero = bincount_vmap(segment_ids.astype(jnp.int32))
     seqlens = seqlens_with_zero[..., 1:]
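
As context for the TODO, here is a minimal standalone example of how the vmapped bincount turns segment IDs into per-segment lengths; the values are made up for illustration.

import jax
import jax.numpy as jnp
from functools import partial

# Segment IDs per batch row: 0 marks padding, 1..N label the segments.
segment_ids = jnp.array([[1, 1, 1, 2, 2, 0, 0, 0]])
batch, max_seqlen = segment_ids.shape

# bincount with length=max_seqlen counts IDs 0..max_seqlen-1 per row.
bincount_vmap = jax.vmap(partial(jnp.bincount, length=max_seqlen))
seqlens_with_zero = bincount_vmap(segment_ids.astype(jnp.int32))
seqlens = seqlens_with_zero[..., 1:]  # drop the padding (ID 0) count
print(seqlens)  # [[3 2 0 0 0 0 0]] -> segment 1 has 3 tokens, segment 2 has 2
# The TODO asks whether length should be max_seqlen + 1: with length=max_seqlen,
# a count for a segment ID equal to max_seqlen itself would be dropped, which
# only matters if that many distinct segments can actually appear in one row.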
@@ -352,6 +353,8 @@ class FusedAttnRunner:
     bias_shape: BiasShape
     window_size: Tuple[int, int]
     seq_desc_format: SeqDescFormat
+    stripe_size: int = 0
+    num_segments_per_seq: int = 0

     # Specifies sharding resources for distributed tests
     number_of_devices: int = 1

@@ -577,7 +580,9 @@ def generate_random_segment_ids(
             return segment_ids, segment_pos, segment_pad

         if self.qkv_layout.is_thd():
-            self.num_segments_per_seq = 2
+            # If using default num segments of 0, set to 2
+            if self.num_segments_per_seq == 0:

Collaborator: Why is the default overridden here? Can we make the default

Collaborator (Author): Yes, I will make this change. It was in my TODO but I missed it.

+                self.num_segments_per_seq = 2
             self.segment_ids_q, self.segment_pos_q, self.pad_q = generate_random_segment_ids(
                 self.batch_size, self.max_seqlen_q, self.num_segments_per_seq, seed=42
             )
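
A minimal sketch of the suggested follow-up, assuming the field becomes Optional rather than overloading 0 as a sentinel; the class name here is a hypothetical stand-in for the test runner, not the actual change.

from dataclasses import dataclass
from typing import Optional

@dataclass
class FusedAttnRunnerSketch:  # hypothetical stand-in for FusedAttnRunner
    num_segments_per_seq: Optional[int] = None  # None = "use the THD default"

    def _setup_segments(self, is_thd: bool) -> None:
        if is_thd and self.num_segments_per_seq is None:
            # Only fill in the default when the caller did not request a
            # specific segment count.
            self.num_segments_per_seq = 2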
@@ -635,12 +640,14 @@ def generate_random_segment_ids(
                 strategy=reorder_strategy,
                 cp_size=self.cp_size,
                 seq_dim=seq_dim,
+                stripe_size=self.stripe_size,
             )
             self.cp_inverse_reorder_fn = partial(
                 inverse_reorder_causal_load_balancing,
                 strategy=reorder_strategy,
                 cp_size=self.cp_size,
                 seq_dim=seq_dim,
+                stripe_size=self.stripe_size,
             )
         else:
             # no-ops for non cp or non load balanced

@@ -771,7 +778,7 @@ def to_dp_shardings(x):

     def test_forward(self):
         """
-        Test forward without JIT
+        Test forward with JITted primitive and unJITted reference

Collaborator: Is there a reason the reference JAX impl is unjitted? It should be equivalent JIT'd and could speed up our tests. Ack, this is unrelated to this PR's focus, just mentioning it so we can discuss if this could be improved in a separate PR.

Collaborator (Author): Unsure, to be honest. I was expecting it to be jitted as well, but when I noticed it wasn't, I thought it should at least be explicit in the docstring to address later. Agree with you.

         """
         self._setup_inputs()

@@ -801,6 +808,7 @@ def test_forward(self):
             "window_size": self.window_size,
             "context_parallel_strategy": self.cp_strategy,
             "context_parallel_causal_load_balanced": self.cp_load_balanced,
+            "stripe_size": self.stripe_size,
         }

         customcall_fused_dpa_jit = jit(

@@ -896,6 +904,7 @@ def grad_func(func, *args, cp_reverse_out=False, **kwargs):
             "window_size": self.window_size,
             "context_parallel_strategy": self.cp_strategy,
             "context_parallel_causal_load_balanced": self.cp_load_balanced,
+            "stripe_size": self.stripe_size,
         }

         # We can compute dBias only for the [1, h, s, s] layout

@@ -386,23 +386,27 @@ def _obtain_batch_and_max_seqlen(qkv, qkv_layout):
     return batch, q_max_seqlen, kv_max_seqlen


-def reorder_causal_load_balancing(tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int):
+def reorder_causal_load_balancing(
+    tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int, stripe_size: int = 1
+):
     """Reorders a tensor for load balancing the compute of causal attention."""
     if strategy == ReorderStrategy.DualChunkSwap:
         return tex.attention.reorder_causal_dual_chunk_swap(tensor, cp_size, seq_dim, False)
     if strategy == ReorderStrategy.Striped:
-        return tex.attention.reorder_causal_striped(tensor, cp_size, seq_dim, False)
+        # stripe_size > 1 is only supported for CP+THD+AG+Striped
+        return tex.attention.reorder_causal_striped(tensor, cp_size, seq_dim, False, stripe_size)
     raise ValueError(f"Unsupported {strategy=}")


 def inverse_reorder_causal_load_balancing(
-    tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int
+    tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int, stripe_size: int = 1
 ):
     """Inverse operation of `reorder_causal_load_balancing`."""
     if strategy == ReorderStrategy.DualChunkSwap:
         return tex.attention.reorder_causal_dual_chunk_swap(tensor, cp_size, seq_dim, True)
     if strategy == ReorderStrategy.Striped:
-        return tex.attention.reorder_causal_striped(tensor, cp_size, seq_dim, True)
+        # stripe_size > 1 is only supported for CP+THD+AG+Striped
+        return tex.attention.reorder_causal_striped(tensor, cp_size, seq_dim, True, stripe_size)
     raise ValueError(f"Unsupported {strategy=}")
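
For reference, a round trip through the updated public helpers might look like the following; the import path is assumed from the repo layout and the shapes are illustrative only.

import jax.numpy as jnp
# Assumed import path; the names themselves appear in the diff above.
from transformer_engine.jax.attention import (
    ReorderStrategy,
    reorder_causal_load_balancing,
    inverse_reorder_causal_load_balancing,
)

x = jnp.zeros((2, 512, 8, 64))  # [batch, seqlen, heads, dim], so seq_dim=1
y = reorder_causal_load_balancing(
    x, ReorderStrategy.Striped, cp_size=4, seq_dim=1, stripe_size=64
)
z = inverse_reorder_causal_load_balancing(
    y, ReorderStrategy.Striped, cp_size=4, seq_dim=1, stripe_size=64
)
assert jnp.array_equal(z, x)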
@@ -988,7 +992,7 @@ def fused_attn_thd(
     return output


-@partial(jax.custom_vjp, nondiff_argnums=(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17))
+@partial(jax.custom_vjp, nondiff_argnums=(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18))
 def _fused_attn(
     qkv: Tuple[jnp.ndarray, ...],
     bias: Optional[jnp.ndarray],
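
As context for the nondiff_argnums change (17 to 18 entries): with jax.custom_vjp, arguments listed in nondiff_argnums are treated as static and non-differentiable and are handed to the backward rule first, which is why the new stripe_size parameter at position 18 is appended to the tuple and threaded into _fused_attn_bwd_rule ahead of ctx and dz. Here is a small self-contained illustration of that pattern, not taken from the PR; scale_power is a toy stand-in for stripe_size.

from functools import partial
import jax

@partial(jax.custom_vjp, nondiff_argnums=(1,))
def scaled(x, scale_power):
    # scale_power is a static integer, analogous to stripe_size.
    return x * (2.0 ** scale_power)

def scaled_fwd(x, scale_power):
    return scaled(x, scale_power), None  # no residuals needed

def scaled_bwd(scale_power, residuals, dz):
    # nondiff args arrive first in the bwd rule, just like stripe_size does.
    return (dz * (2.0 ** scale_power),)

scaled.defvjp(scaled_fwd, scaled_bwd)
print(jax.grad(scaled)(3.0, 2))  # 4.0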
@@ -1008,6 +1012,7 @@ def _fused_attn(
     context_parallel_causal_load_balanced: bool,
     context_parallel_axis: str,
     context_checkpoint_name: str = "context",
+    stripe_size: int = 0,
 ):
     output, _ = _fused_attn_fwd_rule(
         qkv,

@@ -1028,6 +1033,7 @@ def _fused_attn(
         context_parallel_causal_load_balanced,
         context_parallel_axis,
         context_checkpoint_name=context_checkpoint_name,
+        stripe_size=stripe_size,
     )
     return output

@@ -1051,6 +1057,7 @@ def _fused_attn_fwd_rule(
     context_parallel_causal_load_balanced,
     context_parallel_axis,
     context_checkpoint_name,
+    stripe_size,
 ):
     output, softmax_aux, rng_state = tex.fused_attn_fwd(
         qkv,

@@ -1070,6 +1077,7 @@ def _fused_attn_fwd_rule(
         context_parallel_strategy=context_parallel_strategy,
         context_parallel_causal_load_balanced=context_parallel_causal_load_balanced,
         context_parallel_axis=context_parallel_axis,
+        stripe_size=stripe_size,
     )
     output = checkpoint_name(output, context_checkpoint_name)
     softmax_aux = checkpoint_name(softmax_aux, context_checkpoint_name)

@@ -1099,6 +1107,7 @@ def _fused_attn_bwd_rule(
     context_parallel_causal_load_balanced,
     context_parallel_axis,
     context_checkpoint_name,
+    stripe_size,
     ctx,
     dz,
 ):

@@ -1133,6 +1142,7 @@ def _fused_attn_bwd_rule(
         context_parallel_strategy=context_parallel_strategy,
         context_parallel_causal_load_balanced=context_parallel_causal_load_balanced,
         context_parallel_axis=context_parallel_axis,
+        stripe_size=stripe_size,
     )
     if attn_bias_type == AttnBiasType.NO_BIAS:
         grad_bias = None

@@ -1169,6 +1179,7 @@ def fused_attn(
     context_parallel_axis: str = "",
     context_checkpoint_name: str = "context",
     softmax_offset: Optional[jnp.ndarray] = None,
+    stripe_size: int = 0,

Collaborator: Should the default be 1 or 0?

 ):
     """
     Perform cuDNN fused attention.

@@ -1206,6 +1217,10 @@ def fused_attn(
         softmax_offset (Optional[jnp.ndarray]): An optional learnable softmax offset tensor with shape
             [1, num_heads, 1, 1]. Used when softmax_type is AttnSoftmaxType.LEARNABLE_SOFTMAX.
             If provided, this parameter will receive gradients during backpropagation.
+        stripe_size (int):
+            Indicates the striping size to be used when using ReorderStrategy.Striped.
+            Currently, a stripe_size > 1 is only allowed for CP + THD + Striped + AG.
+            0 indicates no striping strategy.
     Returns:
         (jnp.ndarray): The output tensor from the fused attention.

@@ -1283,5 +1298,6 @@ def fused_attn(
         context_parallel_causal_load_balanced=context_parallel_causal_load_balanced,
         context_parallel_axis=context_parallel_axis,
         context_checkpoint_name=context_checkpoint_name,
+        stripe_size=stripe_size,
     )
     return output
Collaborator: Thanks for measuring the test runtime! What is our total L1 test time currently? We were intermittently failing previously but reduced attention tests by ~15 mins, iirc. This is now increasing by 11 mins, so I'm concerned we may hit timeouts again.

Collaborator (Author): I think with the merging of this PR we will be around the 80-85 min mark for the L1 tests, so I do not expect to hit timeouts (the limit is closer to 120 mins).

I believe after your changes to reduce the L1 test timing we had stopped hitting the limit anyway. My reduction of the attention tests was more of an additional effort, and we brought the total time closer to 70 mins if I remember right.

Now with sink attention and this PR, we should have a total increase of about ~15 mins, so I expect to hit ~85 mins. Nonetheless, I will report the findings from the last CI pipeline I run.

I do not think we are alarmingly close, but a clean-up in the future would only help.