Commit 92eefbe

fix(fitness): resolve exception handling and timing instrumentation issues
- Simplify exception re-raising in GPU fitness checks to preserve exception context
- Replace time.time() with time.perf_counter() for consistent monotonic timing across all fitness checks
- Ensures exception chains are maintained and timing measurements are accurate
1 parent aef8849 commit 92eefbe

2 files changed: +12, -12 lines

runpod/serverless/modules/rp_gpu_fitness.py

Lines changed: 6 additions & 6 deletions
@@ -166,10 +166,10 @@ async def _run_gpu_test_binary() -> Dict[str, Any]:
         raise RuntimeError(
             f"GPU test binary timed out after {TIMEOUT_SECONDS}s"
         ) from None
-    except FileNotFoundError as exc:
-        raise exc
-    except PermissionError as exc:
-        raise exc
+    except FileNotFoundError:
+        raise
+    except PermissionError:
+        raise
     except Exception as exc:
         raise RuntimeError(f"GPU test binary execution failed: {exc}") from exc

@@ -214,8 +214,8 @@ def _run_gpu_test_fallback() -> None:
         raise RuntimeError("nvidia-smi not found. Cannot validate GPU availability.") from None
     except subprocess.TimeoutExpired:
         raise RuntimeError("nvidia-smi timed out") from None
-    except Exception as exc:
-        raise exc
+    except RuntimeError:
+        raise


 async def _check_gpu_health() -> None:
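
For context, a minimal standalone sketch (not taken from this repository) of what the re-raise change buys: a bare raise re-raises the active exception with its traceback untouched, while raise exc re-raises the same object but appends the re-raise line as an extra traceback entry.

# Standalone illustration, independent of the repository code; the function
# names are hypothetical. Bare raise keeps the original traceback as-is,
# while raise exc adds the re-raise line to it.
import traceback

def old_style():
    try:
        open("/nonexistent/gpu_test_binary")
    except FileNotFoundError as exc:
        raise exc  # traceback gains an entry pointing at this line

def new_style():
    try:
        open("/nonexistent/gpu_test_binary")
    except FileNotFoundError:
        raise  # traceback still points only at the failing open() call

for fn in (old_style, new_style):
    try:
        fn()
    except FileNotFoundError:
        traceback.print_exc()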

runpod/serverless/modules/rp_system_fitness.py

Lines changed: 6 additions & 6 deletions
@@ -161,11 +161,11 @@ async def _check_network_connectivity() -> None:
     port = 53

     try:
-        start_time = time.time()
+        start_time = time.perf_counter()
         reader, writer = await asyncio.wait_for(
             asyncio.open_connection(host, port), timeout=NETWORK_CHECK_TIMEOUT
         )
-        elapsed_ms = (time.time() - start_time) * 1000
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
         writer.close()
         await writer.wait_closed()

@@ -374,15 +374,15 @@ async def _check_gpu_compute_benchmark() -> None:

         # Create small matrix on GPU
         size = 1024
-        start_time = time.time()
+        start_time = time.perf_counter()

         # Do computation
         A = torch.randn(size, size, device="cuda")
         B = torch.randn(size, size, device="cuda")
         torch.matmul(A, B)
         torch.cuda.synchronize()  # Wait for GPU to finish

-        elapsed_ms = (time.time() - start_time) * 1000
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
         max_ms = GPU_BENCHMARK_TIMEOUT * 1000

         if elapsed_ms > max_ms:
@@ -404,14 +404,14 @@ async def _check_gpu_compute_benchmark() -> None:
         import cupy as cp

         size = 1024
-        start_time = time.time()
+        start_time = time.perf_counter()

         A = cp.random.randn(size, size)
         B = cp.random.randn(size, size)
         cp.matmul(A, B)
         cp.cuda.Device().synchronize()

-        elapsed_ms = (time.time() - start_time) * 1000
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
         max_ms = GPU_BENCHMARK_TIMEOUT * 1000

         if elapsed_ms > max_ms:
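
For reference, a minimal standalone sketch (not taken from this repository) of the timing pattern the commit standardizes on: time.perf_counter() is a monotonic, high-resolution clock, so an elapsed measurement cannot jump or go negative when the wall clock is adjusted (e.g. by NTP), which time.time() does not guarantee.

# Standalone illustration, independent of the repository code; timed_ms is a
# hypothetical helper. perf_counter() is monotonic and high resolution, which
# makes it the right choice for elapsed-time measurements.
import time

def timed_ms(fn):
    start = time.perf_counter()
    fn()
    return (time.perf_counter() - start) * 1000

elapsed_ms = timed_ms(lambda: sum(i * i for i in range(100_000)))
print(f"benchmark took {elapsed_ms:.2f} ms")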
