@@ -2733,7 +2733,7 @@ def _run_module(m, inp):
 
 def test_fp8_weight_on_demand_transpose():
     if not fp8_block_scaling_available:
-        pytest.skip("blockwise fp8 not available.")
+        pytest.skip("blockwise fp8 not available.")
 
     dtype = torch.bfloat16
     num_gemms = 4
@@ -2758,7 +2758,9 @@ def test_fp8_weight_on_demand_transpose():
 
     # Share params
     with torch.no_grad():
-        weights_cache = [Parameter(getattr(grouped_linear, f"weight{i}").clone()) for i in range(num_gemms)]
+        weights_cache = [
+            Parameter(getattr(grouped_linear, f"weight{i}").clone()) for i in range(num_gemms)
+        ]
 
     for i in range(num_gemms):
         assert getattr(grouped_linear, f"weight{i}")._columnwise_data is not None
@@ -2811,7 +2813,6 @@ def test_fp8_weight_on_demand_transpose():
     for i, (o, o_ref) in enumerate(zip(outputs1, outputs2)):
         torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
 
-
     # 2. layernorm linear module test
     FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = False
     with fp8_model_init(enabled=True, recipe=fp8_recipe):
@@ -2831,9 +2832,7 @@ def test_fp8_weight_on_demand_transpose():
     # Share params
     weights_cache = []
     with torch.no_grad():
-        weights_cache.append(
-            te_ln_linear.te_module.layer_norm_weight.clone()
-        )
+        weights_cache.append(te_ln_linear.te_module.layer_norm_weight.clone())
         weights_cache.append(te_ln_linear.te_module.weight.clone())
     outputs1 = _test_granular_accuracy(te_ln_linear, bs, dtype, config, recipe=fp8_recipe)
 
@@ -2861,7 +2860,6 @@ def test_fp8_weight_on_demand_transpose():
     for i, (o, o_ref) in enumerate(zip(outputs1, outputs2)):
         torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
 
-
     # 3. linear module test
     FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = False
     with fp8_model_init(enabled=True, recipe=fp8_recipe):
@@ -2880,7 +2878,9 @@ def test_fp8_weight_on_demand_transpose():
     with torch.no_grad():
         weights_cache = te_linear.weight.clone()
 
-    te_outputs1 = _test_granular_accuracy(te_linear, bs, dtype, config, delay_wgrad_compute=True, recipe=fp8_recipe)
+    te_outputs1 = _test_granular_accuracy(
+        te_linear, bs, dtype, config, delay_wgrad_compute=True, recipe=fp8_recipe
+    )
 
     FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = True
     with fp8_model_init(enabled=True, recipe=fp8_recipe):
@@ -2896,10 +2896,12 @@ def test_fp8_weight_on_demand_transpose():
     assert te_linear.weight._columnwise_data is None
 
     te_linear.weight = Parameter(weights_cache)
-    te_outputs2 = _test_granular_accuracy(te_linear, bs, dtype, config, delay_wgrad_compute=True, recipe=fp8_recipe)
+    te_outputs2 = _test_granular_accuracy(
+        te_linear, bs, dtype, config, delay_wgrad_compute=True, recipe=fp8_recipe
+    )
 
     # should be bit-wise match
     for i, (o, o_ref) in enumerate(zip(outputs1, outputs2)):
         torch.testing.assert_close(o, o_ref, rtol=0, atol=0)
 
-    FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = old_value
+    FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = old_value
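For reference, the pattern every module variant above exercises is the same: build the module with the on-demand-transpose flag off (columnwise FP8 weight data kept), build it again with the flag on (columnwise data dropped at init, so `_columnwise_data is None`), share the cached weights between the two, and require bit-wise identical outputs. Below is a minimal standalone sketch of that pattern, not the test itself: the recipe class name `Float8BlockScaling`, the 128-wide layer size, and availability of CUDA hardware with blockwise FP8 support are assumptions; `fp8_model_init(recipe=...)`, `FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE`, and `_columnwise_data` are taken from the diff.

# Minimal sketch of the toggle-and-compare pattern (assumptions noted above).
import torch
from torch.nn import Parameter
import transformer_engine.pytorch as te
from transformer_engine.common import recipe
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager

fp8_recipe = recipe.Float8BlockScaling()  # assumed blockwise FP8 recipe class
x = torch.randn(16, 128, dtype=torch.bfloat16, device="cuda")
old_value = FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE

# Reference path: columnwise (transposed) FP8 weight data is kept at init.
FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = False
with te.fp8_model_init(enabled=True, recipe=fp8_recipe):
    ref = te.Linear(128, 128, params_dtype=torch.bfloat16)
with torch.no_grad():
    cached_weight = ref.weight.clone()
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out_ref = ref(x)

# On-demand path: columnwise data is dropped at init and rebuilt when needed.
FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = True
with te.fp8_model_init(enabled=True, recipe=fp8_recipe):
    mod = te.Linear(128, 128, params_dtype=torch.bfloat16)
assert mod.weight._columnwise_data is None
mod.weight = Parameter(cached_weight)  # share params so outputs are comparable
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = mod(x)

# The two paths should match bit-for-bit, then restore the global flag.
torch.testing.assert_close(out, out_ref, rtol=0, atol=0)
FP8GlobalStateManager.FP8_BLOCKWISE_WEIGHT_ON_DEMAND_TRANSPOSE = old_value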