Commit 92eefbe

fix(fitness): resolve exception handling and timing instrumentation issues
- Simplify exception re-raising in GPU fitness checks to preserve exception context
- Replace time.time() with time.perf_counter() for consistent monotonic timing across all fitness checks
- Ensures exception chains are maintained and timing measurements are accurate
1 parent aef8849 commit 92eefbe

2 files changed: +12, -12 lines

runpod/serverless/modules/rp_gpu_fitness.py

Lines changed: 6 additions & 6 deletions
@@ -166,10 +166,10 @@ async def _run_gpu_test_binary() -> Dict[str, Any]:
         raise RuntimeError(
             f"GPU test binary timed out after {TIMEOUT_SECONDS}s"
         ) from None
-    except FileNotFoundError as exc:
-        raise exc
-    except PermissionError as exc:
-        raise exc
+    except FileNotFoundError:
+        raise
+    except PermissionError:
+        raise
     except Exception as exc:
         raise RuntimeError(f"GPU test binary execution failed: {exc}") from exc

@@ -214,8 +214,8 @@ def _run_gpu_test_fallback() -> None:
         raise RuntimeError("nvidia-smi not found. Cannot validate GPU availability.") from None
     except subprocess.TimeoutExpired:
         raise RuntimeError("nvidia-smi timed out") from None
-    except Exception as exc:
-        raise exc
+    except RuntimeError:
+        raise


 async def _check_gpu_health() -> None:
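
For context, a minimal standalone sketch (not taken from this repository) of what the re-raise change buys: a bare raise re-raises the active exception with its traceback untouched, while raise exc re-raises the same object but appends the re-raise line as an extra traceback entry.

# Standalone illustration, independent of the repository code; the function
# names are hypothetical. Bare raise keeps the original traceback as-is,
# while raise exc adds the re-raise line to it.
import traceback

def old_style():
    try:
        open("/nonexistent/gpu_test_binary")
    except FileNotFoundError as exc:
        raise exc  # traceback gains an entry pointing at this line

def new_style():
    try:
        open("/nonexistent/gpu_test_binary")
    except FileNotFoundError:
        raise  # traceback still points only at the failing open() call

for fn in (old_style, new_style):
    try:
        fn()
    except FileNotFoundError:
        traceback.print_exc()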

runpod/serverless/modules/rp_system_fitness.py

Lines changed: 6 additions & 6 deletions
@@ -161,11 +161,11 @@ async def _check_network_connectivity() -> None:
     port = 53

     try:
-        start_time = time.time()
+        start_time = time.perf_counter()
         reader, writer = await asyncio.wait_for(
             asyncio.open_connection(host, port), timeout=NETWORK_CHECK_TIMEOUT
         )
-        elapsed_ms = (time.time() - start_time) * 1000
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
         writer.close()
         await writer.wait_closed()

@@ -374,15 +374,15 @@ async def _check_gpu_compute_benchmark() -> None:

         # Create small matrix on GPU
         size = 1024
-        start_time = time.time()
+        start_time = time.perf_counter()

         # Do computation
         A = torch.randn(size, size, device="cuda")
         B = torch.randn(size, size, device="cuda")
         torch.matmul(A, B)
         torch.cuda.synchronize()  # Wait for GPU to finish

-        elapsed_ms = (time.time() - start_time) * 1000
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
         max_ms = GPU_BENCHMARK_TIMEOUT * 1000

         if elapsed_ms > max_ms:
@@ -404,14 +404,14 @@ async def _check_gpu_compute_benchmark() -> None:
         import cupy as cp

         size = 1024
-        start_time = time.time()
+        start_time = time.perf_counter()

         A = cp.random.randn(size, size)
         B = cp.random.randn(size, size)
         cp.matmul(A, B)
         cp.cuda.Device().synchronize()

-        elapsed_ms = (time.time() - start_time) * 1000
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
         max_ms = GPU_BENCHMARK_TIMEOUT * 1000

         if elapsed_ms > max_ms:
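
For reference, a minimal standalone sketch (not taken from this repository) of the timing pattern the commit standardizes on: time.perf_counter() is a monotonic, high-resolution clock, so an elapsed measurement cannot jump or go negative when the wall clock is adjusted (e.g. by NTP), which time.time() does not guarantee.

# Standalone illustration, independent of the repository code; timed_ms is a
# hypothetical helper. perf_counter() is monotonic and high resolution, which
# makes it the right choice for elapsed-time measurements.
import time

def timed_ms(fn):
    start = time.perf_counter()
    fn()
    return (time.perf_counter() - start) * 1000

elapsed_ms = timed_ms(lambda: sum(i * i for i in range(100_000)))
print(f"benchmark took {elapsed_ms:.2f} ms")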
