diff --git a/examples/utils.py b/examples/utils.py
index 8dd545f5..e954d074 100644
--- a/examples/utils.py
+++ b/examples/utils.py
@@ -94,11 +94,15 @@ def verbose_allclose(
     return []
 
 
-def clear_l2_cache():
-    # import cupy as cp
-    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
-    # create a large dummy tensor
-    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
-    # write stuff to it
-    dummy.fill_(42)
-    del dummy
+def clear_l2_cache(device='cuda'):
+    """
+    Clears GPU L2 cache by allocating and zeroing a buffer.
+
+    GB200 has 126 MB L2 cache. Using 512 MB (4x buffer).
+    See: https://docs.nvidia.com/cuda/blackwell-tuning-guide/
+
+    Adapted from triton.testing.do_bench.
+    """
+    cache_size = 512 * 1024 * 1024
+    cache = torch.empty(int(cache_size // 4), dtype=torch.int32, device=device)
+    cache.zero_()
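
For context, a minimal sketch of how `clear_l2_cache` might be used in a benchmarking loop, flushing L2 between timed iterations so no run benefits from data cached by the previous one. The kernel, tensor shapes, and iteration count below are hypothetical and not part of this PR; only the `clear_l2_cache` call reflects the changed code:

```python
import torch

from examples.utils import clear_l2_cache  # the function changed in this PR


def bench(fn, iters=100):
    """Mean latency of fn in ms, measured with CUDA events.

    clear_l2_cache() runs before each iteration so every timing starts
    from a cold L2, matching the intent of triton.testing.do_bench.
    """
    starts = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
    ends = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
    for i in range(iters):
        clear_l2_cache()  # evict the previous iteration's data from L2
        starts[i].record()
        fn()
        ends[i].record()
    torch.cuda.synchronize()
    return sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / iters


# Hypothetical workload for illustration only.
a = torch.randn(4096, 4096, device="cuda")
b = torch.randn(4096, 4096, device="cuda")
print(f"matmul: {bench(lambda: a @ b):.3f} ms")
```

Zeroing the buffer (rather than just allocating it) is what does the work here: `torch.empty` reserves memory without touching it, while `zero_` issues writes that stream through L2 and evict whatever the previous iteration left there.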