@@ -59,14 +59,16 @@ def init():
     yield
 
 
-@partial(jax.jit, static_argnums=(5, 6, 7, 9))
+@partial(jax.jit, static_argnums=(6, 7, 8, 9, 11))
 def general_dot_product_attention(
     query: ArrayLike,
     key: ArrayLike,
     value: ArrayLike,
+    softmax_offset: Optional[ArrayLike],
     bias: ArrayLike,
     mask: ArrayLike,
     deterministic: bool,
+    softmax_type: AttnSoftmaxType,
     scale_factor: float,
     dropout_rate: float,
     dropout_rng: ArrayLike,
@@ -99,7 +101,25 @@ def general_dot_product_attention(
         mask = jnp.expand_dims(mask, axis=-3)
         logits = jnp.where(mask, jnp.finfo(dtype).min, logits)
 
-    softmax_out = jax.nn.softmax(logits).astype(dtype)
+    match softmax_type:
+        case AttnSoftmaxType.VANILLA_SOFTMAX:
+            softmax_out = jax.nn.softmax(logits).astype(dtype)
+        case AttnSoftmaxType.OFF_BY_ONE_SOFTMAX:
+            # Softmax with +1 in the denominator: exp(x_i) / (sum(exp(x_j)) + 1)
+            exp_logits = jnp.exp(logits - jnp.max(logits, axis=-1, keepdims=True))
+            softmax_out = (exp_logits / (jnp.sum(exp_logits, axis=-1, keepdims=True) + 1.0)).astype(dtype)
+        case AttnSoftmaxType.LEARNABLE_SOFTMAX:
+            # Reshape softmax_offset from (1, h_q, 1, 1) to (1, h_kv, num_groups, 1, 1)
+            # to match logits, whose shape is (b, h_kv, num_groups, s_q, s_kv)
+            if softmax_offset is not None and softmax_offset.size > 0:
+                softmax_offset_reshaped = softmax_offset.reshape(1, h_kv, num_groups, 1, 1)
+            else:
+                softmax_offset_reshaped = jnp.zeros((1, h_kv, num_groups, 1, 1), dtype=jnp.float32)
+            exp_logits = jnp.exp(logits - jnp.max(logits, axis=-1, keepdims=True))
+            softmax_out = (exp_logits / (jnp.sum(exp_logits, axis=-1, keepdims=True) + jnp.exp(softmax_offset_reshaped))).astype(dtype)
+        case _:
+            raise NotImplementedError(f"Unknown {softmax_type=}")
+
 
     if not deterministic and dropout_rate > 0.0:
         keep_prob = 1.0 - dropout_rate
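Aside: a minimal standalone JAX sketch of how the three softmax variants added above relate to one another. With the max-subtracted formulation used in the diff, LEARNABLE_SOFTMAX with an offset of 0 reduces to OFF_BY_ONE_SOFTMAX (since exp(0) = 1), and a very negative offset recovers the vanilla softmax (since exp(offset) underflows to 0). The helper names below are illustrative and not part of the test file.

import jax
import jax.numpy as jnp

def stable_exp(x):
    # Subtract the row max for numerical stability, as in the diff above.
    return jnp.exp(x - jnp.max(x, axis=-1, keepdims=True))

def learnable_softmax(logits, offset):
    # exp(x_i - m) / (sum_j exp(x_j - m) + exp(offset))
    e = stable_exp(logits)
    return e / (jnp.sum(e, axis=-1, keepdims=True) + jnp.exp(offset))

logits = jax.random.normal(jax.random.PRNGKey(0), (2, 4, 8))
vanilla = jax.nn.softmax(logits)
e = stable_exp(logits)
off_by_one = e / (jnp.sum(e, axis=-1, keepdims=True) + 1.0)

assert jnp.allclose(learnable_softmax(logits, 0.0), off_by_one)           # offset = 0 -> off-by-one
assert jnp.allclose(learnable_softmax(logits, -1e9), vanilla, atol=1e-6)  # exp(offset) -> 0 -> vanilla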
@@ -219,19 +239,21 @@ def _split_valid_and_invalid(primitive, reference, pad):
     return primitive_valid, primitive_invalid, reference_valid, reference_invalid
 
 
-def jax_dpa(query, key, value, bias, mask, dropout_rng, **kwargs):
+def jax_dpa(query, key, value, bias, softmax_offset, mask, dropout_rng, **kwargs):
     """
     JAX native dot product attention implementation
     """
     output = general_dot_product_attention(
         query,
         key,
         value,
+        softmax_offset,
         bias,
         mask,
         deterministic=not kwargs["is_training"],
         scale_factor=kwargs["scaling_factor"],
         dropout_rate=kwargs["dropout_probability"],
+        softmax_type=kwargs["softmax_type"],
         dropout_rng=dropout_rng,
         dtype=jnp.float32,
     )
@@ -243,6 +265,7 @@ def customcall_fused_dpa(
     key,
     value,
     bias,
+    softmax_offset,
     sequence_descriptor,
     dropout_rng,
     **kwargs,
@@ -264,7 +287,7 @@ def customcall_fused_dpa(
             qkv_args = (query, key, value)
         case _:
             raise ValueError(f"Unsupported {qkv_layout=}")
-    return fused_attn(qkv_args, bias, sequence_descriptor, dropout_rng, **kwargs).astype(
+    return fused_attn(qkv_args, bias, sequence_descriptor, dropout_rng, softmax_offset=softmax_offset, **kwargs).astype(
         query.dtype
     )
 
@@ -412,7 +435,7 @@ def _setup_inputs(self):
         self.tp_size = self.mesh.shape.get(self.mesh_resource.tpsp_resource, 1)
 
         key = jax.random.PRNGKey(0)
-        q_key, k_key, v_key, bias_key, dropout_key = jax.random.split(key, 5)
+        q_key, k_key, v_key, bias_key, dropout_key, softmax_key = jax.random.split(key, 6)
 
         q_shape = (self.batch_size, self.max_seqlen_q, self.num_heads_q, self.head_dim_qk)
         k_shape = (self.batch_size, self.max_seqlen_kv, self.num_heads_kv, self.head_dim_qk)
@@ -462,6 +485,11 @@ def _setup_inputs(self):
             pad_ratio = 0.3
         else:
             pad_ratio = 0.0
+
+        if self.softmax_type == AttnSoftmaxType.LEARNABLE_SOFTMAX:
+            self.softmax_offset = jax.random.uniform(softmax_key, (1, self.num_heads_q, 1, 1), self.dtype, -1.0)
+        else:
+            self.softmax_offset = None
 
         def gen_valid(bs, max_seqlen, pad_ratio):
             pad_len = int(max_seqlen * pad_ratio)
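Aside: in the jax.random.uniform call above, the positional -1.0 is the minval argument (maxval keeps its default of 1.0), so the learnable offset is initialized in [-1.0, 1.0). A standalone sketch with an illustrative head count:

import jax
import jax.numpy as jnp

num_heads_q = 16  # illustrative value; the test derives this from its fixtures
softmax_key = jax.random.PRNGKey(1)
softmax_offset = jax.random.uniform(softmax_key, (1, num_heads_q, 1, 1), jnp.bfloat16, minval=-1.0)
assert softmax_offset.shape == (1, num_heads_q, 1, 1)
assert float(softmax_offset.min()) >= -1.0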
@@ -682,6 +710,10 @@ def to_dp_shardings(x):
         self.bias_pspec = PartitionSpec()
         self.bias_sharding = NamedSharding(self.mesh, self.bias_pspec)
 
+        # Softmax offset sharding: (1, num_heads, 1, 1)
+        self.softmax_offset_pspec = PartitionSpec(None, self.mesh_resource.tpsp_resource, None, None)
+        self.softmax_offset_sharding = NamedSharding(self.mesh, self.softmax_offset_pspec)
+
         self.dropout_rng_pspec = PartitionSpec(
             None,
         )
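Aside: a minimal standalone sketch of the sharding added above, assuming at least two local devices and a hypothetical mesh whose "tp" axis stands in for mesh_resource.tpsp_resource. Only the heads dimension of the (1, num_heads, 1, 1) offset is partitioned; the other dimensions stay replicated.

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

devices = np.array(jax.devices()[:2]).reshape(1, 2)  # assumes >= 2 devices
mesh = Mesh(devices, ("dp", "tp"))

num_heads = 8  # must be divisible by the size of the "tp" axis
softmax_offset = jnp.zeros((1, num_heads, 1, 1), dtype=jnp.float32)

# Shard only the heads dimension over the tensor-parallel axis, as in the diff.
offset_sharding = NamedSharding(mesh, PartitionSpec(None, "tp", None, None))
softmax_offset = jax.device_put(softmax_offset, offset_sharding)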
@@ -701,7 +733,7 @@ def test_forward(self):
701733 """
702734 self ._setup_inputs ()
703735
704- args = [self .q , self .k , self .v , self .bias , self .mask , self .dropout_rng ]
736+ args = [self .q , self .k , self .v , self .bias , self .softmax_offset , self . mask , self .dropout_rng ]
705737
706738 customcall_args = [
707739 # Put test data onto each GPU for distributed.
@@ -711,6 +743,7 @@ def test_forward(self):
             jax.device_put(self.cp_reorder_fn(self.k), self.qkvo_sharding),
             jax.device_put(self.cp_reorder_fn(self.v), self.qkvo_sharding),
             jax.device_put(self.bias, self.bias_sharding),
+            jax.device_put(self.softmax_offset, self.softmax_offset_sharding),
             jax.device_put(self.sequence_desciptor, self.seq_desc_sharding),
             jax.device_put(self.dropout_rng, self.dropout_rng_sharding),
         ]
@@ -736,6 +769,7 @@ def test_forward(self):
                 self.qkvo_sharding,
                 self.qkvo_sharding,
                 self.bias_sharding,
+                self.softmax_offset_sharding,
                 self.seq_desc_sharding,
                 self.dropout_rng_sharding,
             ],
@@ -796,14 +830,15 @@ def grad_func(func, *args, cp_reverse_out=False, **kwargs):
                 jnp.mean(ret_valid.astype(jnp.float32), dtype=jnp.float32) * gradient_multiplier
             ).astype(self.dtype)
 
-        args = [self.q, self.k, self.v, self.bias, self.mask, self.dropout_rng]
+        args = [self.q, self.k, self.v, self.bias, self.softmax_offset, self.mask, self.dropout_rng]
         customcall_args = [
             # TODO(mgoldfarb-nvidia): We will need to add reordering for bias, mask and
             # THD params once we support those features on CP.
             jax.device_put(self.cp_reorder_fn(self.q), self.qkvo_sharding),
             jax.device_put(self.cp_reorder_fn(self.k), self.qkvo_sharding),
             jax.device_put(self.cp_reorder_fn(self.v), self.qkvo_sharding),
             jax.device_put(self.bias, self.bias_sharding),
+            jax.device_put(self.softmax_offset, self.softmax_offset_sharding),
             jax.device_put(self.sequence_desciptor, self.seq_desc_sharding),
             jax.device_put(self.dropout_rng, self.dropout_rng_sharding),
         ]
@@ -822,6 +857,7 @@ def grad_func(func, *args, cp_reverse_out=False, **kwargs):
         }
 
         # We can compute dBias only for the [1, h, s, s] layout
+        # arg positions: q=0, k=1, v=2, bias=3, softmax_offset=4
         if self.bias_shape == BiasShape._1HSS:
             arg_nums = (0, 1, 2, 3)
             grad_shardings = (
@@ -837,8 +873,8 @@ def grad_func(func, *args, cp_reverse_out=False, **kwargs):
         # Summing the results in FP16/BF16 may overflow, so use FP32 for the summation
         jitted_primitive = jit(
             value_and_grad(
-                lambda q, k, v, bias, *args: grad_func(
-                    customcall_fused_dpa, q, k, v, bias, *args, cp_reverse_out=True, **kwargs
+                lambda q, k, v, bias, softmax_offset, *args: grad_func(
+                    customcall_fused_dpa, q, k, v, bias, softmax_offset, *args, cp_reverse_out=True, **kwargs
                 ),
                 arg_nums,
             ),
@@ -847,14 +883,15 @@ def grad_func(func, *args, cp_reverse_out=False, **kwargs):
                 self.qkvo_sharding,
                 self.qkvo_sharding,
                 self.bias_sharding,
+                self.softmax_offset_sharding,
                 self.seq_desc_sharding,
                 self.dropout_rng_sharding,
             ),
             out_shardings=(None, grad_shardings),
         )
         jitted_reference = jit(
             value_and_grad(
-                lambda q, k, v, bias, *args: grad_func(jax_dpa, q, k, v, bias, *args, **kwargs),
+                lambda q, k, v, bias, softmax_offset, *args: grad_func(jax_dpa, q, k, v, bias, softmax_offset, *args, **kwargs),
                 arg_nums,
             )
         )
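Aside: the arg_nums tuple controls which positional arguments value_and_grad differentiates. With the new lambda signature noted above (q=0, k=1, v=2, bias=3, softmax_offset=4), the arg_nums = (0, 1, 2, 3) shown in this hunk still yields gradients only for q, k, v, and bias; the softmax offset is threaded through but not differentiated. A toy standalone example of that pattern, with a hypothetical loss:

import jax
import jax.numpy as jnp

def loss(q, k, v, bias, softmax_offset, mask):
    # softmax_offset and mask are accepted but play no role in this toy loss.
    return jnp.sum(q * k * v) + jnp.sum(bias)

x = jnp.ones((2, 3))
val, grads = jax.value_and_grad(loss, argnums=(0, 1, 2, 3))(x, x, x, x, x, x)
assert len(grads) == 4  # gradients for q, k, v, bias only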
@@ -1097,6 +1134,7 @@ def _test_forward(
             seq_desc_format,
         )
         runner.test_forward()
+
 
     @staticmethod
     @pytest.mark.parametrize(
@@ -1150,3 +1188,4 @@ def test_backward(
             seq_desc_format,
         )
         runner.test_backward()
+