Skip to content

Commit e59272a

Browse files
sbryngelson and claude authored
ci: fix monitor hang when SLURM job is preempted and requeued (#1311)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 25a074e commit e59272a

File tree

1 file changed

+9
-0
lines changed

1 file changed

+9
-0
lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,15 @@ get_job_state() {
5252
# Fallback to sacct (works for completed/historical jobs)
5353
if command -v sacct >/dev/null 2>&1; then
5454
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
55+
# When a job is preempted+requeued, sacct -X reports PREEMPTED for the
56+
# original attempt while the requeued run may have completed. Check all
57+
# records (without -X) for a terminal state that supersedes PREEMPTED.
58+
if [ "$state" = "PREEMPTED" ]; then
59+
requeue_state=$(sacct -j "$jid" -n -P -o State 2>/dev/null | grep -v PREEMPTED | head -n1 | cut -d'|' -f1 || true)
60+
if [ -n "$requeue_state" ]; then
61+
state="$requeue_state"
62+
fi
63+
fi
5564
if [ -n "$state" ]; then
5665
echo "$state"
5766
return

0 commit comments

Comments
 (0)