diff --git a/fbgemm_gpu/bench/bench_utils.py b/fbgemm_gpu/bench/bench_utils.py
index d36ce63d8e..83bf59bc0e 100644
--- a/fbgemm_gpu/bench/bench_utils.py
+++ b/fbgemm_gpu/bench/bench_utils.py
@@ -187,7 +187,8 @@ def benchmark_requests(
     # Run at least one warmup iteration to avoid the long cudaLaunchKernel time
     # for the first kernel if warmup_ms > 0
     # warmup_ms is prioritized over num_warmups
-    if (warmup_ms is None):
+
+    if warmup_ms is None:
         num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
 
     # warm-up the GPU before profiling
diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
index 019860e2f3..d59b658cbe 100644
--- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
@@ -95,10 +95,7 @@
 logging.basicConfig(level=logging.DEBUG)
 
 
-def kineto_trace_profiler(
-    p: profile,
-    trace_info: tuple[str, str, str, str]
-) -> float:
+def kineto_trace_profiler(p: profile, trace_info: tuple[str, str, str, str]) -> float:
     phase, trace_url, tbe_type, kern_name = trace_info
     p.export_chrome_trace(
         trace_url.format(tbe_type=tbe_type, phase=phase, ospid=os.getpid())
@@ -1169,13 +1166,15 @@ def nbit_cpu(  # noqa C901
     default="{tbe_type}_tbe_{phase}_trace_{ospid}.json",
 )
 @click.option(
-    "--warmup-runs", default=2,
-    help="Number of warmup runs. Ignored if --warmup-ms is set.")
+    "--warmup-runs",
+    default=2,
+    help="Number of warmup runs. Ignored if --warmup-ms is set.",
+)
 @click.option(
     "--warmup-ms",
     type=int,
     default=None,
-    help="Warmup duration in milliseconds. Disables the --run-nums option."
+    help="Warmup duration in milliseconds. Disables the --warmup-runs option.",
 )
 def nbit_device(  # noqa C901
     alpha: float,
@@ -1393,7 +1392,7 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
                 indices.int(),
                 offsets.int(),
                 per_sample_weights,
-            )
+            ),
         )
 
     with context_factory(
@@ -1411,7 +1410,7 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
         )
 
     if export_trace:
-        kernel_time = time_dict['kernel_time']
+        kernel_time = time_dict["kernel_time"]
         bandwidth = read_write_bytes / kernel_time / 1.0e3
 
         logging.info(
@@ -1536,13 +1535,15 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
     default="{tbe_type}_tbe_spec_{phase}_trace_{ospid}.json",
 )
 @click.option(
-    "--warmup-runs", default=2,
-    help="Number of warmup runs. Ignored if --warmup-ms is set.")
+    "--warmup-runs",
+    default=2,
+    help="Number of warmup runs. Ignored if --warmup-ms is set.",
+)
 @click.option(
     "--warmup-ms",
     type=int,
     default=None,
-    help="Warmup duration in milliseconds. Disables the --run-nums option."
+    help="Warmup duration in milliseconds. Disables the --warmup-runs option.",
 )
 def nbit_device_with_spec(  # noqa C901
     alpha: float,
@@ -1760,11 +1761,11 @@ def nbit_device_with_spec(  # noqa C901
                     per_sample_weights,
                 ),
                 check_median=check_median,
-                warmup_ms=warmup_ms
+                warmup_ms=warmup_ms,
             )
 
         # copy the request of last iteration for kineto profile benchmark
-        if (i == runs_of_iters - 1):
+        if i == runs_of_iters - 1:
             kineto_request = requests
 
         # free up memory
@@ -1815,7 +1816,7 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
                     indices.int(),
                     offsets.int(),
                     per_sample_weights,
-                )
+                ),
             )
 
         with context_factory(
@@ -1833,7 +1834,7 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
             )
 
         if export_trace:
-            kernel_time = time_dict['kernel_time']
+            kernel_time = time_dict["kernel_time"]
             bandwidth = read_write_bytes / kernel_time / 1.0e3
 
             logging.info(