
Commit b8a1722

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 5cefbcf commit b8a1722

3 files changed (+65, -49 lines)
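The edits below are mechanical: the pre-commit hooks re-wrap statements that exceed the configured line length and normalize whitespace and brace spacing, without changing behavior. As a rough illustration of the wrapping style (assuming a Black-style Python formatter with a roughly 100-character limit; the names below are hypothetical, not taken from the repository):

# Hypothetical example of the re-wrapping applied throughout this commit.
limit = 64
value = 128

# Before: a single assert that exceeds the configured line length.
assert value % limit == 0, "value must be a multiple of the configured alignment limit"

# After: the same assert, parenthesized so the condition and message sit on separate lines.
assert (
    value % limit == 0
), "value must be a multiple of the configured alignment limit"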

tests/pytorch/nvfp4/test_nvfp4_group_quantize.py

Lines changed: 32 additions & 19 deletions
@@ -25,6 +25,7 @@

 recipe_available, reason_for_no_recipe = te.is_nvfp4_available(return_reason=True)

+
 def generate_random_multiples_sum(total=8192, n=4, multiple=64):
     if total % multiple != 0:
         raise ValueError(f"Total ({total}) must be a multiple of {multiple}")
@@ -38,13 +39,14 @@ def generate_random_multiples_sum(total=8192, n=4, multiple=64):
     cuts = sorted(random.sample(range(1, total_units), n - 1))

     # convert to segment lengths
-    parts = [cuts[0]] + \
-            [cuts[i] - cuts[i-1] for i in range(1, len(cuts))] + \
-            [total_units - cuts[-1]]
+    parts = (
+        [cuts[0]] + [cuts[i] - cuts[i - 1] for i in range(1, len(cuts))] + [total_units - cuts[-1]]
+    )

     # convert back to multiples
     return [p * multiple for p in parts]

+
 def generate_split_sections(M: int, N: int, edge_cases: str) -> list[int]:
     least_multiple = 64
     num_chunks = 4
@@ -53,7 +55,7 @@ def generate_split_sections(M: int, N: int, edge_cases: str) -> list[int]:
     avg_split = M // num_chunks

     if M == 0 or N == 0:
-        # all zeros
+        # all zeros
         return [0] * num_chunks
     if edge_cases == "regular":
         split_sections = [avg_split] * num_chunks
@@ -73,7 +75,9 @@ def generate_split_sections(M: int, N: int, edge_cases: str) -> list[int]:

     # make sure every split_section is a multiple of least_multiple
     for split_section in split_sections:
-        assert split_section % least_multiple == 0, "The split_sections are not multiples of least_multiple"
+        assert (
+            split_section % least_multiple == 0
+        ), "The split_sections are not multiples of least_multiple"

     return split_sections

@@ -175,8 +179,8 @@ def check_group_quantization_nvfp4_versus_reference(
         )
         for _ in range(len(split_sections))
     ]
-    x_qx_ref, x_sx_ref, x_amax_rowwise_ref, x_qx_t_ref, x_sx_t_ref, x_amax_colwise_ref = reference_group_quantize(
-        x, quantizers, split_sections, return_transpose
+    x_qx_ref, x_sx_ref, x_amax_rowwise_ref, x_qx_t_ref, x_sx_t_ref, x_amax_colwise_ref = (
+        reference_group_quantize(x, quantizers, split_sections, return_transpose)
     )

     split_quantize_outputs = tex.split_quantize(x, split_sections, quantizers)
@@ -195,27 +199,31 @@ def check_group_quantization_nvfp4_versus_reference(
         torch.testing.assert_close(x_amax_rowwise[i], x_amax_rowwise_ref[i], atol=0.0, rtol=0.0)
         torch.testing.assert_close(x_qx[i], x_qx_ref[i], atol=0.0, rtol=0.0)
         valid_scale_shape = get_nvfp4_scale_shape_no_padding(x_splits[i].shape, False)
-        x_sx_valid = x_sx[i][:valid_scale_shape[0], :valid_scale_shape[1]]
-        x_sx_ref_valid = x_sx_ref[i][:valid_scale_shape[0], :valid_scale_shape[1]]
+        x_sx_valid = x_sx[i][: valid_scale_shape[0], : valid_scale_shape[1]]
+        x_sx_ref_valid = x_sx_ref[i][: valid_scale_shape[0], : valid_scale_shape[1]]
         torch.testing.assert_close(x_sx_valid, x_sx_ref_valid, atol=0.0, rtol=0.0)
-
+
     if return_transpose:
-        x_qx_t = [output._columnwise_data.view(dtype=torch.uint8) for output in split_quantize_outputs]
+        x_qx_t = [
+            output._columnwise_data.view(dtype=torch.uint8) for output in split_quantize_outputs
+        ]
         x_sx_t = [output._columnwise_scale_inv for output in split_quantize_outputs]
         x_amax_colwise = [output._amax_columnwise for output in split_quantize_outputs]
-        # assert with zero tolerance
+        # assert with zero tolerance
         for i in range(len(x_qx_t)):
             if split_sections[i] == 0:
                 # then just assert the same same and dtype because the buffer won't be zero out
                 assert_same_shape_and_dtype(x_amax_colwise[i], x_amax_colwise_ref[i])
                 assert_same_shape_and_dtype(x_qx_t[i], x_qx_t_ref[i])
                 assert_same_shape_and_dtype(x_sx_t[i], x_sx_t_ref[i])
-            else:
-                torch.testing.assert_close(x_amax_colwise[i], x_amax_colwise_ref[i], atol=0.0, rtol=0.0)
+            else:
+                torch.testing.assert_close(
+                    x_amax_colwise[i], x_amax_colwise_ref[i], atol=0.0, rtol=0.0
+                )
             torch.testing.assert_close(x_qx_t[i], x_qx_t_ref[i], atol=0.0, rtol=0.0)
             valid_scale_shape = get_nvfp4_scale_shape_no_padding(x_splits[i].shape, True)
-            x_sx_t_valid = x_sx_t[i][:valid_scale_shape[0], :valid_scale_shape[1]]
-            x_sx_t_ref_valid = x_sx_t_ref[i][:valid_scale_shape[0], :valid_scale_shape[1]]
+            x_sx_t_valid = x_sx_t[i][: valid_scale_shape[0], : valid_scale_shape[1]]
+            x_sx_t_ref_valid = x_sx_t_ref[i][: valid_scale_shape[0], : valid_scale_shape[1]]
             torch.testing.assert_close(x_sx_t_valid, x_sx_t_ref_valid, atol=0.0, rtol=0.0)


@@ -234,7 +242,14 @@ def check_group_quantization_nvfp4_versus_reference(
 )
 @pytest.mark.parametrize("x_dtype", [torch.bfloat16], ids=str)
 @pytest.mark.parametrize(
-    "edge_cases", ["regular", "zero_tokens_front", "zero_tokens_end", "zero_tokens_middle", "random_uneven_split"]
+    "edge_cases",
+    [
+        "regular",
+        "zero_tokens_front",
+        "zero_tokens_end",
+        "zero_tokens_middle",
+        "random_uneven_split",
+    ],
 )
 @pytest.mark.parametrize(
     "return_transpose", [True, False], ids=["quantize_transpose", "skip_transpose"]
@@ -263,5 +278,3 @@ def test_rht_with_quantization_block_tiling_versus_reference(
         with_post_rht_amax=True,
         with_random_sign_mask=with_random_sign_mask,
     )
-
-
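Taken together, the two reformatted helpers above generate split sections that sum to M while keeping each chunk a multiple of 64. A standalone sketch of that splitting logic, adapted from the hunks above (the total_units computation is inferred, since that line falls outside the shown context, and the usage at the bottom is illustrative rather than part of the test):

import random


def generate_random_multiples_sum(total=8192, n=4, multiple=64):
    # Split `total` into n parts, each a positive multiple of `multiple`.
    if total % multiple != 0:
        raise ValueError(f"Total ({total}) must be a multiple of {multiple}")
    total_units = total // multiple  # inferred intermediate step, not shown in the diff
    # pick n - 1 distinct cut points among the units
    cuts = sorted(random.sample(range(1, total_units), n - 1))
    # convert cut points to segment lengths
    parts = (
        [cuts[0]] + [cuts[i] - cuts[i - 1] for i in range(1, len(cuts))] + [total_units - cuts[-1]]
    )
    # scale back up to multiples of `multiple`
    return [p * multiple for p in parts]


# Illustrative usage: four chunks that sum to 512, each a multiple of 64.
sections = generate_random_multiples_sum(total=512, n=4, multiple=64)
assert sum(sections) == 512 and all(s % 64 == 0 for s in sections)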
tests/pytorch/test_numerics.py

Lines changed: 4 additions & 1 deletion
@@ -12,7 +12,10 @@
 import torch.nn as nn
 from torch.nn import Parameter

-from transformer_engine.pytorch.quantization import FP8GlobalStateManager, get_align_size_for_quantization
+from transformer_engine.pytorch.quantization import (
+    FP8GlobalStateManager,
+    get_align_size_for_quantization,
+)
 from transformer_engine.pytorch.utils import (
     init_method_normal,
     scaled_init_method_normal,

transformer_engine/pytorch/csrc/extensions/cast.cpp

Lines changed: 29 additions & 29 deletions
@@ -146,7 +146,7 @@ void multi_tensor_quantize_nvfp4_impl(const TensorWrapper &input,
     nvte_tensor_input_list.push_back(input_list[i].data());
     nvte_tensor_output_list.push_back(output_list[i].data());
   }
-
+
   // stochastic rounding support for multi tensor
   if (quantizer->stochastic_rounding) {
     // TODO: implement stochastic rounding support for multi tensor
@@ -160,29 +160,26 @@ void multi_tensor_quantize_nvfp4_impl(const TensorWrapper &input,

   // with or without RHT, use nvte_multi_hadamard_transform_amax
   // out.amax is the rowwise amax, out.columnwise_amax is the columnwise amax
-  // rowwise amax will be the amax of original amax(input)
+  // rowwise amax will be the amax of original amax(input)
   // columnwise amax will be the amax of the amax(RHT(input.t))
   if (quantizer->with_rht) {
     // bf16 only for now
-    NVTE_CHECK(input.dtype() == DType::kBFloat16, "NVFP4 multi_quantize: RHT is only supported for bfloat16 input");
+    NVTE_CHECK(input.dtype() == DType::kBFloat16,
+               "NVFP4 multi_quantize: RHT is only supported for bfloat16 input");
     if (quantizer->with_post_rht_amax) {
       // We need:
       // 1. Rowwise amax = amax for input
       // 2. Columnwise amax = amax for RHT(input.t)
       NVTE_SCOPED_GIL_RELEASE({
         nvte_multi_hadamard_transform_amax(
-            input.data(),
-            reinterpret_cast<NVTETensor*>(nvte_tensor_output_list.data()),
-            split_sections.data(),
-            num_tensors,
-            0,
-            quantizer->rht_matrix_random_sign_mask_t,
+            input.data(), reinterpret_cast<NVTETensor *>(nvte_tensor_output_list.data()),
+            split_sections.data(), num_tensors, 0, quantizer->rht_matrix_random_sign_mask_t,
             stream);
       });
-    }else {
+    } else {
       NVTE_CHECK(false, "NVFP4 multi_quantize: Pre-RHT amax is not supported yet");
     }
-  }else {
+  } else {
     // TODO: implement this too when we disable RHT
     NVTE_CHECK(false, "NVFP4 multi_quantize: RHT is not supported when RHT is disabled for now");
   }
@@ -191,7 +188,7 @@ void multi_tensor_quantize_nvfp4_impl(const TensorWrapper &input,
   if (quantizer->with_rht) {
     // check the availablibilty of RHT matrix definition for best perf
     NVTE_CHECK(quantizer->rht_matrix.defined() && quantizer->rht_matrix.numel() > 0,
-               "NVFP4 multi_quantize: RHT matrix is not set");
+               "NVFP4 multi_quantize: RHT matrix is not set");
     auto rht_matrix_nvte = makeTransformerEngineTensor(quantizer->rht_matrix);

     NVTE_SCOPED_GIL_RELEASE({
@@ -211,12 +208,15 @@ void multi_tensor_quantize_nvfp4_impl(const TensorWrapper &input,
        out_identity.set_rowwise_scale_inv(out_identity_scale_inv.data_ptr,
                                           static_cast<DType>(out_identity_scale_inv.dtype),
                                           out_identity_scale_inv.shape);
-        out_identity.set_amax(out_identity_amax.data_ptr, static_cast<DType>(out_identity_amax.dtype),
+        out_identity.set_amax(out_identity_amax.data_ptr,
+                              static_cast<DType>(out_identity_amax.dtype),
                               out_identity_amax.shape);
-
-        NVTE_SCOPED_GIL_RELEASE(
-            { nvte_quantize_v2(input_list[i].data(), out_identity.data(), quant_config_list[i], stream); });
-      }
+
+        NVTE_SCOPED_GIL_RELEASE({
+          nvte_quantize_v2(input_list[i].data(), out_identity.data(), quant_config_list[i],
+                           stream);
+        });
+      }

       // already eligible for RHT columnwise cast fusion after the dimension check
       if (quantizer->columnwise_usage) {
@@ -240,16 +240,17 @@ void multi_tensor_quantize_nvfp4_impl(const TensorWrapper &input,
          colwise_data_shape_2d.push_back(last_dim);

          out_transpose.set_rowwise_data(out_columnwise_data.data_ptr,
-                                         static_cast<DType>(out_columnwise_data.dtype),
-                                         colwise_data_shape_2d);
+                                         static_cast<DType>(out_columnwise_data.dtype),
+                                         colwise_data_shape_2d);
          out_transpose.set_rowwise_scale_inv(out_columnwise_scale_inv.data_ptr,
                                              static_cast<DType>(out_columnwise_scale_inv.dtype),
                                              out_columnwise_scale_inv.shape);
          out_transpose.set_amax(out_columnwise_amax.data_ptr,
-                                static_cast<DType>(out_columnwise_amax.dtype),
-                                out_columnwise_amax.shape);
-          nvte_hadamard_transform_cast_fusion_columnwise(
-              input_list[i].data(), out_transpose.data(), rht_matrix_nvte.data(), quant_config_list[i], stream);
+                                static_cast<DType>(out_columnwise_amax.dtype),
+                                out_columnwise_amax.shape);
+          nvte_hadamard_transform_cast_fusion_columnwise(input_list[i].data(), out_transpose.data(),
+                                                         rht_matrix_nvte.data(),
+                                                         quant_config_list[i], stream);
        }
      }
    });
@@ -264,7 +265,6 @@ void multi_tensor_quantize_nvfp4_impl(const TensorWrapper &input,
      }
    });
  }
-
}

void multi_tensor_quantize_impl(const TensorWrapper &single_input,
@@ -290,7 +290,7 @@ void multi_tensor_quantize_impl(const TensorWrapper &single_input,

  // check if split_sections is just a dummy input
  bool valid_split_sections = split_sections.size() == num_tensors;
-
+
  // Check scaling mode consistency across all tensors
  for (size_t i = 0; i < num_tensors; i++) {
    if (detail::IsFloat8Quantizers(quantizer_py_list[i].ptr())) {
@@ -300,7 +300,7 @@ void multi_tensor_quantize_impl(const TensorWrapper &single_input,
        with_fused_kernel = false;
        break;
      }
-      // check if the scaling mode is fp8 delayed scaling for all quantizers
+      // check if the scaling mode is fp8 delayed scaling for all quantizers
      if (scaling_mode != NVTE_DELAYED_TENSOR_SCALING) {
        with_fused_kernel = false;
        break;
@@ -317,12 +317,12 @@ void multi_tensor_quantize_impl(const TensorWrapper &single_input,
        if (split_sections[i] % 64 != 0) {
          with_fused_kernel = false;
          break;
-        }
-      }else {
+        }
+      } else {
        with_fused_kernel = false;
        break;
      }
-
+
    } else {
      with_fused_kernel = false;
      break;
