From 1eb855c211e5fdd2c66e12770839c61b47dd9265 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 14 Mar 2026 09:14:53 -0400 Subject: [PATCH] ci: fix monitor hang when SLURM job is preempted and requeued When a job is preempted+requeued, sacct -X reports PREEMPTED for the original attempt even after the requeued run completes. The monitor excluded PREEMPTED from terminal states (correct for active requeues) but never detected the requeued completion via sacct, causing it to loop on state=PREEMPTED for hours until the GHA timeout killed it. Fix: when sacct -X returns PREEMPTED, also query without -X to find the requeued run's terminal state (COMPLETED, FAILED, etc). Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/scripts/monitor_slurm_job.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index 1142e97057..1f70a18d35 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -52,6 +52,15 @@ get_job_state() { # Fallback to sacct (works for completed/historical jobs) if command -v sacct >/dev/null 2>&1; then state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true) + # When a job is preempted+requeued, sacct -X reports PREEMPTED for the + # original attempt while the requeued run may have completed. Check all + # records (without -X) for a terminal state that supersedes PREEMPTED. + if [ "$state" = "PREEMPTED" ]; then + requeue_state=$(sacct -j "$jid" -n -P -o State 2>/dev/null | grep -v PREEMPTED | head -n1 | cut -d'|' -f1 || true) + if [ -n "$requeue_state" ]; then + state="$requeue_state" + fi + fi if [ -n "$state" ]; then echo "$state" return