[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 547bf11f5bc2 · 2025-11-13T23:23:24.000Z
for more information, see https://pre-commit.ci
diff --git a/tests/jax/test_distributed_fused_attn.py b/tests/jax/test_distributed_fused_attn.py
@@ -280,7 +280,7 @@ def test_cross_attn(
 
 DISTRIBUTED_CONTEXT_SELF_ATTN_DATA_SHAPES = [
     # Sequence lengths will be scaled by CP*2 so that we don't run with tiny sizes.
-    #TODO: Change the id to CPx2
+    # TODO: Change the id to CPx2
     pytest.param([2, 128, 8, 128], id="2-128xCP-8-128"),
     pytest.param([4, 256, 16, 64], id="4-256xCP-16-64"),
     # KL test code
@@ -362,7 +362,7 @@ def impl_test_context_parallel_attn(
             mesh_resource=mesh_resource,
             cp_strategy=cp_strategy,
             cp_load_balanced=load_balanced,
-            stripe_height=stripe_height
+            stripe_height=stripe_height,
         )
 
         def check_has_backend_for_mask(mask_type):
@@ -401,8 +401,8 @@ def check_has_backend_for_mask(mask_type):
         if num_head % kv_groups != 0 or (num_head // kv_groups) % tp_size != 0:
             pytest.skip(f"Skipping {kv_groups=} not multiple of {data_shape=} or {tp_size=}")
 
-        #KL code
-        #runner.test_backward()
+        # KL code
+        # runner.test_backward()
         runner.test_forward()
         del os.environ["NVTE_FUSED_RING_ATTENTION_USE_SCAN"]
 
@@ -602,6 +602,7 @@ def test_context_parallel_ring_attn_shardy(
     pytest.param(ReorderStrategy.Striped, 4, id="Striped-4"),
 ]
 
+
 class TestReorderCausalLoadBalancing:
     @pytest.mark.parametrize("cp_size", [2, 4, 8])
     @pytest_parametrize_wrapper("shape", REORDER_CAUSAL_LOAD_BALANCING_DATA_SHAPES)
@@ -619,10 +620,9 @@ def test(self, cp_size, shape, qkv_format, reorder_strategy, stripe_height):
 
         if reorder_strategy == ReorderStrategy.Striped:
             seq_lens = shape[seq_dim]
-            if seq_lens < (cp_size*stripe_height):
+            if seq_lens < (cp_size * stripe_height):
                 pytest.skip(f"{seq_lens=} must be larger than {cp_size*stripe_height=}")
 
-
         ref = tensor.copy()
 
         reorder = jax.jit(reorder_causal_load_balancing, static_argnums=[1, 2, 3, 4])
diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py
@@ -474,12 +474,16 @@ def _setup_inputs(self):
         token_numbers_k = range(self.max_seqlen_kv)
         for batch_idx in range(q_shape[0]):
             for token_idx in token_numbers_q:
-                q_np[batch_idx][token_idx][0] = np.ones(self.head_dim_qk, self.dtype) * (token_idx + 1)
+                q_np[batch_idx][token_idx][0] = np.ones(self.head_dim_qk, self.dtype) * (
+                    token_idx + 1
+                )
             for token_idx in token_numbers_k:
-                k_np[batch_idx][token_idx][0] = np.ones(self.head_dim_qk, self.dtype) * np.sqrt(self.head_dim_qk)
+                k_np[batch_idx][token_idx][0] = np.ones(self.head_dim_qk, self.dtype) * np.sqrt(
+                    self.head_dim_qk
+                )
             v_np = np.ones(v_shape, self.dtype)
             # Set cols at multiples
-            v_np[0,::4, 0, :] = np.arange(v_np.shape[3])
+            v_np[0, ::4, 0, :] = np.arange(v_np.shape[3])
             self.q = jnp.array(q_np)
             self.k = jnp.array(k_np)
             self.v = jnp.array(v_np)
@@ -541,7 +545,7 @@ def generate_random_segment_ids(
                     min_segment_size = 1
                     if min_segment_len is not None:
                         min_segment_size = min_segment_len[i][seg_id]
-                    #KL test code
+                    # KL test code
                     min_segment_size = 4
                     segment_size = rng.integers(min_segment_size, max_segment_size + 1)
                     if current_pos + segment_size > sequence_length:
@@ -598,8 +602,16 @@ def generate_random_segment_ids(
             )
             self.segment_pos_q = self.segment_pos_kv = None
             self.seqlens_q = self.seqlens_kv = self.offsets_q = self.offsets_kv = None
-        print(f"self.segment_ids_q: {self.segment_ids_q}, \n self.segment_pos_q: {self.segment_pos_q}, \n self.pad_q: {self.pad_q}, \n self.seqlens_q: {self.seqlens_q}, \n self.offsets_q: { self.offsets_q} \n")
-        print(f"self.segment_ids_kv: {self.segment_ids_kv}, \n self.segment_pos_kv: {self.segment_pos_kv}, \n self.pad_kv: {self.pad_kv}, \n self.seqlens_kv: {self.seqlens_kv}, \n self.offsets_kv: { self.offsets_kv} \n")
+        print(
+            f"self.segment_ids_q: {self.segment_ids_q}, \n self.segment_pos_q:"
+            f" {self.segment_pos_q}, \n self.pad_q: {self.pad_q}, \n self.seqlens_q:"
+            f" {self.seqlens_q}, \n self.offsets_q: { self.offsets_q} \n"
+        )
+        print(
+            f"self.segment_ids_kv: {self.segment_ids_kv}, \n self.segment_pos_kv:"
+            f" {self.segment_pos_kv}, \n self.pad_kv: {self.pad_kv}, \n self.seqlens_kv:"
+            f" {self.seqlens_kv}, \n self.offsets_kv: { self.offsets_kv} \n"
+        )
 
         # For reference code
         self.mask = make_mask(
@@ -612,6 +624,7 @@ def generate_random_segment_ids(
         )
         # KL tet code
         import sys
+
         with np.printoptions(threshold=sys.maxsize):
             print(f"self.mask: \n {self.mask}")
 
@@ -876,7 +889,7 @@ def grad_func(func, *args, cp_reverse_out=False, **kwargs):
             "window_size": self.window_size,
             "context_parallel_strategy": self.cp_strategy,
             "context_parallel_causal_load_balanced": self.cp_load_balanced,
-            #"stripe_height": self.stripe_height,
+            # "stripe_height": self.stripe_height,
         }
 
         # We can compute dBias only for the [1, h, s, s] layout
diff --git a/tests/jax/utils.py b/tests/jax/utils.py
@@ -1462,11 +1462,12 @@ def assert_allclose(
         desired = desired.astype(jnp.float32)
     # KL test code
     import sys
+
     mismatch_counter = 0
     has_nonzero = jnp.any(actual != 0)
     print(f"has_nonzero: {has_nonzero}")
     with np.printoptions(threshold=sys.maxsize):
-        mismatch_mask = ~np.isclose(actual, desired, **tols) # True means mismatch
+        mismatch_mask = ~np.isclose(actual, desired, **tols)  # True means mismatch
         diff_indices = np.argwhere(mismatch_mask)
         for idx in diff_indices:
             idx_tuple = tuple(idx)
diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu
@@ -49,7 +49,9 @@
 namespace transformer_engine {
 namespace fused_attn {
 template <typename T>
-__global__ void print_tensor_elements_2(const T *const data, const size_t rows, const size_t start_cols, const size_t end_cols, const size_t cols) {
+__global__ void print_tensor_elements_2(const T *const data, const size_t rows,
+                                        const size_t start_cols, const size_t end_cols,
+                                        const size_t cols) {
   if ((threadIdx.x == 0) && (threadIdx.y == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
     for (size_t i = 0; i < rows; ++i) {
       for (size_t j = start_cols; j < end_cols; ++j) {
@@ -487,47 +489,47 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
           static_cast<const int32_t *>(devPtrCuSeqlensKV), static_cast<int32_t *>(devActualSeqlenQ),
           static_cast<int32_t *>(devActualSeqlenKV));
       NVTE_CHECK_CUDA(cudaGetLastError());
-      std::cout << "print_tensors: " << print_tensors << 
-                   "print_tensors_custom_mask: "
-                << print_tensors_custom_mask << std::endl;
-      if (print_tensors)
-      {
-        if(devPtrCuSeqlensQ) {
-          if(print_tensors_custom_mask)
-          {
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 0, 8, /*does not matter for single row*/ actual_b);
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensQ), 1,
-                                                        1024, 1032,
-                                                       /*does not matter for single row*/ actual_b);
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensQ), 1,
-                                                        8184, 8192,
-                                                       /*does not matter for single row*/ actual_b);
-          }
-          else
-          {
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 0, actual_b, /*does not matter for single row*/actual_b);
+      std::cout << "print_tensors: " << print_tensors
+                << "print_tensors_custom_mask: " << print_tensors_custom_mask << std::endl;
+      if (print_tensors) {
+        if (devPtrCuSeqlensQ) {
+          if (print_tensors_custom_mask) {
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 0, 8,
+                /*does not matter for single row*/ actual_b);
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 1024, 1032,
+                /*does not matter for single row*/ actual_b);
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 8184, 8192,
+                /*does not matter for single row*/ actual_b);
+          } else {
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 0, actual_b,
+                /*does not matter for single row*/ actual_b);
             cudaDeviceSynchronize();
           }
         }
         if (devActualSeqlenQ) {
-          if (print_tensors_custom_mask)
-          {
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenQ), 1, 0, 8, /*does not matter for single row*/actual_b);
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenQ), 1,
-                                                       1024, 1032,
-                                                       /*does not matter for single row*/ actual_b);
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenQ), 1,
-                                                       8184, 8192,
-                                                       /*does not matter for single row*/ actual_b); 
-          }
-          else {
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenQ), 1, 0, actual_b, /*does not matter for single row*/actual_b);
+          if (print_tensors_custom_mask) {
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devActualSeqlenQ), 1, 0, 8,
+                /*does not matter for single row*/ actual_b);
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devActualSeqlenQ), 1, 1024, 1032,
+                /*does not matter for single row*/ actual_b);
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devActualSeqlenQ), 1, 8184, 8192,
+                /*does not matter for single row*/ actual_b);
+          } else {
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devActualSeqlenQ), 1, 0, actual_b,
+                /*does not matter for single row*/ actual_b);
             cudaDeviceSynchronize();
           }
         }
-        if(devPtrCuSeqlensKV) {
-          if(print_tensors_custom_mask)
-          {
+        if (devPtrCuSeqlensKV) {
+          if (print_tensors_custom_mask) {
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int32_t *>(devPtrCuSeqlensKV), 1, 0, 8,
                 /*does not matter for single row*/ actual_b);
@@ -537,15 +539,15 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int32_t *>(devPtrCuSeqlensKV), 1, 8184, 8192,
                 /*does not matter for single row*/ actual_b);
-          }
-          else {
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensKV), 1, 0, actual_b, /*does not matter for single row*/ actual_b);
+          } else {
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devPtrCuSeqlensKV), 1, 0, actual_b,
+                /*does not matter for single row*/ actual_b);
             cudaDeviceSynchronize();
           }
         }
-        if(devActualSeqlenKV) {
-          if (print_tensors_custom_mask)
-          {
+        if (devActualSeqlenKV) {
+          if (print_tensors_custom_mask) {
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int32_t *>(devActualSeqlenKV), 1, 0, 8,
                 /*does not matter for single row*/ actual_b);
@@ -555,10 +557,10 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int32_t *>(devActualSeqlenKV), 1, 8184, 8192,
                 /*does not matter for single row*/ actual_b);
-          }
-          else
-          {
-            print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenKV), 1, 0, actual_b, /*does not matter for single row*/ actual_b);
+          } else {
+            print_tensor_elements_2<<<1, 1, 0, stream>>>(
+                static_cast<int32_t *>(devActualSeqlenKV), 1, 0, actual_b,
+                /*does not matter for single row*/ actual_b);
             cudaDeviceSynchronize();
           }
         }
@@ -677,18 +679,18 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
           }
         }
       }
-        if (is_ragged_q) {
-          variant_pack[offset_q] = devOffsetsQ;
-          variant_pack[offset_o] = devOffsetsO;
-        }
-        if (is_ragged_kv) {
-          variant_pack[offset_k] = devOffsetsK;
-          variant_pack[offset_v] = devOffsetsV;
-        }
-        if (is_ragged_q && cudnn_runtime_version >= 90600) {
-          variant_pack[offset_stats] = devOffsetsS;
-        }
+      if (is_ragged_q) {
+        variant_pack[offset_q] = devOffsetsQ;
+        variant_pack[offset_o] = devOffsetsO;
+      }
+      if (is_ragged_kv) {
+        variant_pack[offset_k] = devOffsetsK;
+        variant_pack[offset_v] = devOffsetsV;
+      }
+      if (is_ragged_q && cudnn_runtime_version >= 90600) {
+        variant_pack[offset_stats] = devOffsetsS;
       }
+    }
 
     if (is_dropout) {
       variant_pack[dropout_seed] = devPtrDropoutSeed;
diff --git a/transformer_engine/jax/attention.py b/transformer_engine/jax/attention.py
@@ -353,7 +353,9 @@ def _obtain_batch_and_max_seqlen(qkv, qkv_layout):
     return batch, q_max_seqlen, kv_max_seqlen
 
 
-def reorder_causal_load_balancing(tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int, stripe_height: int = 1):
+def reorder_causal_load_balancing(
+    tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int, stripe_height: int = 1
+):
     """Reorders a tensor for load balancing the compute of causal attention."""
     if strategy == ReorderStrategy.DualChunkSwap:
         return tex.attention.reorder_causal_dual_chunk_swap(tensor, cp_size, seq_dim, False)
@@ -363,7 +365,7 @@ def reorder_causal_load_balancing(tensor, strategy: ReorderStrategy, cp_size: in
 
 
 def inverse_reorder_causal_load_balancing(
-    tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int, stripe_height: int = 1 
+    tensor, strategy: ReorderStrategy, cp_size: int, seq_dim: int, stripe_height: int = 1
 ):
     """Inverse operation of `reorder_causal_load_balancing`."""
     if strategy == ReorderStrategy.DualChunkSwap:
@@ -498,7 +500,7 @@ def _segment_ids_pos_to_seqlens_offsets(
     # This fast path avoids expanding the mask to Q * KV matrix and instead allows us to
     # examine only O(Q+KV) elements.
     # TODO(KshitijLakhani): Try exercising the fast path for BRCM as well
-    #TODO: Un comment the fast path
+    # TODO: Un comment the fast path
     # if (attn_mask_type.is_causal() and window_size is None) or (
     #     window_size == (-1, -1) and not attn_mask_type.is_bottom_right()
     # ):
@@ -517,7 +519,7 @@ def _segment_ids_pos_to_seqlens_offsets(
         segment_ids_kv,
         lambda x, y: jnp.equal(x, y) * x,
     )
-    #jax.debug.breakpoint()
+    # jax.debug.breakpoint()
     # TE JAX Attn expects the THD segments to have q_token <= kv_tokens so that a correct cross-attn type BRCM can be applied
     attn_mask = segment_mask
     if attn_mask_type.is_bottom_right():
@@ -579,7 +581,7 @@ def _segment_ids_pos_to_seqlens_offsets(
     q_seqlen, q_offset, kv_seqlen, kv_offset = _mask_to_seqlens_offset(
         attn_mask_with_id, max_segments_per_seq
     )
-    #jax.debug.breakpoint()
+    # jax.debug.breakpoint()
     return q_seqlen, kv_seqlen, q_offset, kv_offset
 
 
@@ -659,7 +661,7 @@ def get_seqlens_and_offsets(
                 window_size,
                 max_segments_per_seq,
             )
-            #jax.debug.breakpoint()
+            # jax.debug.breakpoint()
         else:
             q_seqlens, kv_seqlens = _segment_ids_to_seqlens(
                 q_segment_ids,
@@ -1038,7 +1040,7 @@ def _fused_attn_fwd_rule(
         context_parallel_strategy=context_parallel_strategy,
         context_parallel_causal_load_balanced=context_parallel_causal_load_balanced,
         context_parallel_axis=context_parallel_axis,
-        stripe_height=stripe_height
+        stripe_height=stripe_height,
     )
     output = checkpoint_name(output, context_checkpoint_name)
     softmax_aux = checkpoint_name(softmax_aux, context_checkpoint_name)
@@ -1162,7 +1164,7 @@ def fused_attn(
             Indicates the sequences are ordered for causal mask load balancing when running context parallelism.
         context_parallel_axis (str): The name of the context parallel axis.
         context_checkpoint_name (str): The name of the context checkpoint for the custom VJP forward pass.
-        stripe_height (int): 
+        stripe_height (int):
             Indicates the striping height to be used when using ReorderStrategy.Striped.
             Currently, a stripe_height > 1 is only allowed for CP + THD + Striped + AG
             0 indicates no striping strategy
diff --git a/transformer_engine/jax/cpp_extensions/attention.py b/transformer_engine/jax/cpp_extensions/attention.py