|
| 1 | +#!/bin/bash |
| 2 | +# Unified SLURM job submission and monitoring for all clusters. |
| 3 | +# Submits a script as a SLURM batch job, then monitors it until completion. |
| 4 | +# Rerun-safe: cancels stale jobs from previous runs before resubmission. |
| 5 | +# |
| 6 | +# Usage: submit-slurm-job.sh <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard] |
| 7 | + |
set -euo pipefail

# Ignore SIGHUP so the script survives login-node session drops
# (an SSH disconnect would otherwise kill the submit/monitor loop).
trap '' HUP

# Print usage to stderr: it is a diagnostic, and keeping stdout clean
# lets callers capture real output safely.
usage() {
  echo "Usage: $0 <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard]" >&2
}
| 16 | + |
# Positional arguments: the first four are required, the shard is optional.
script_path="${1:-}"
device="${2:-}"
interface="${3:-}"
cluster="${4:-}"
shard="${5:-}"

# Reject the invocation if any required argument is missing.
for required_arg in "$script_path" "$device" "$interface" "$cluster"; do
  if [[ -z "$required_arg" ]]; then
    usage
    exit 1
  fi
done
| 27 | + |
# Capture the submitted script's body now; it is inlined verbatim into
# the generated sbatch heredoc further below.
sbatch_script_contents=$(cat "$script_path")

# Absolute directory of this script, used to locate sibling helpers.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Infer the job type from the submitted script's basename:
# names starting with "bench" are benchmarks, everything else is a test.
script_basename="$(basename "$script_path" .sh)"
if [[ "$script_basename" == bench* ]]; then
  job_type="bench"
else
  job_type="test"
fi
| 37 | + |
# --- Cluster configuration ---
# Per cluster: compiler flag for './mfc.sh load', SLURM account, job-name
# prefix, QOS, extra SBATCH directives, time limits, and whether the GPU
# partition is chosen dynamically at submit time.
case "$cluster" in
  phoenix)
    compiler_flag="p"
    account="gts-sbryngelson3"
    job_prefix="shb"
    qos="embers"
    extra_sbatch="#SBATCH --requeue"
    test_time="03:00:00"
    bench_time="04:00:00"
    gpu_partition_dynamic=true
    ;;
  frontier|frontier_amd)
    # The two Frontier variants differ only in the compiler flag.
    if [[ "$cluster" == "frontier_amd" ]]; then
      compiler_flag="famd"
    else
      compiler_flag="f"
    fi
    account="CFD154"
    job_prefix="MFC"
    qos="develop"
    extra_sbatch=""
    test_time="01:59:00"
    bench_time="01:59:00"
    gpu_partition_dynamic=false
    ;;
  *)
    echo "ERROR: Unknown cluster '$cluster'"
    exit 1
    ;;
esac
| 75 | + |
# --- Time limit ---
# Benchmarks get the (possibly longer) bench budget; everything else the
# test budget.
case "$job_type" in
  bench) sbatch_time="#SBATCH -t $bench_time" ;;
  *)     sbatch_time="#SBATCH -t $test_time" ;;
esac
| 82 | + |
# --- Device-specific SBATCH options ---
# Builds $sbatch_device_opts, a newline-joined set of #SBATCH directives
# chosen by device and cluster.
case "$device" in
  cpu)
    case "$cluster" in
      phoenix)
        sbatch_device_opts="\
#SBATCH -p cpu-small
#SBATCH --ntasks-per-node=24
#SBATCH --mem-per-cpu=2G"
        ;;
      frontier|frontier_amd)
        sbatch_device_opts="\
#SBATCH -n 32
#SBATCH -p service"
        ;;
    esac
    ;;
  gpu)
    # Default GPU partition; clusters with dynamic selection override it.
    gpu_partition="batch"
    if [ "$gpu_partition_dynamic" = "true" ]; then
      # Use pre-selected bench partition if available, otherwise query sinfo
      if [ -n "${BENCH_GPU_PARTITION:-}" ]; then
        gpu_partition="$BENCH_GPU_PARTITION"
        echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)"
      else
        source "${SCRIPT_DIR}/select-gpu-partition.sh"
        gpu_partition="$SELECTED_GPU_PARTITION"
      fi
    fi

    case "$cluster" in
      phoenix)
        sbatch_device_opts="\
#SBATCH -p $gpu_partition
#SBATCH --ntasks-per-node=4
#SBATCH -G2
#SBATCH --exclude=atl1-1-03-002-29-0"
        ;;
      frontier|frontier_amd)
        sbatch_device_opts="\
#SBATCH -n 8
#SBATCH -p service"
        ;;
    esac
    ;;
  *)
    usage
    exit 1
    ;;
esac
| 130 | + |
# --- Job slug ---
# Build a filesystem-safe identifier used for the SLURM job name and all
# per-job bookkeeping files.  A "1/4" shard spec becomes "-1-of-4".
shard_suffix=""
if [[ -n "$shard" ]]; then
  shard_suffix="-${shard/\//-of-}"
fi
# Strip the .sh extension, then replace anything non-alphanumeric with '-'.
slug_base="$(basename "$script_path" | sed -e 's/\.sh$//' -e 's/[^a-zA-Z0-9]/-/g')"
job_slug="${slug_base}-${device}-${interface}${shard_suffix}"
output_file="${job_slug}.out"
id_file="${job_slug}.slurm_job_id"
| 139 | + |
# --- Idempotency: cancel stale jobs from previous runs ---
# A previous invocation stored its SLURM job ID in $id_file.  If that job
# is still active, cancel it so two runs never race on the same .out file;
# if it already finished, just log and resubmit fresh.
if [ -f "$id_file" ]; then
  existing_id=$(cat "$id_file")
  # Query the job's state: -n (no header), -X (allocation row only),
  # -P ('|'-separated parseable output); keep only the first row/field and
  # strip spaces.  The trailing '|| true' stops 'set -eo pipefail' from
  # aborting when sacct fails or knows nothing about the job — $state then
  # falls back to UNKNOWN below.
  state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
  case "${state:-UNKNOWN}" in
    RUNNING|PENDING|REQUEUED|COMPLETING)
      echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission"
      # Best-effort cancel: the job may terminate between query and cancel.
      scancel "$existing_id" 2>/dev/null || true
      ;;
    *)
      echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh"
      ;;
  esac
  rm -f "$id_file"
fi

# Remove stale output file so the monitor doesn't pick up old content
# (a previous SLURM job's epilog can write to the .out file after our
# stale-job check, polluting the new job's output stream).
rm -f "$output_file"
| 160 | + |
# --- Module load mode (short form) ---
# 'g' selects GPU modules, 'c' CPU modules (consumed by './mfc.sh load -m').
if [ "$device" = "gpu" ]; then
  module_mode="g"
else
  module_mode="c"
fi
| 163 | + |
# --- Submit ---
# Generate and submit the batch script via an expanding heredoc.  Unescaped
# $vars are interpolated NOW (job metadata, SBATCH directives, and the user
# script body are baked in at submit time), while \$-escaped forms
# (\$SLURM_SUBMIT_DIR, \$(pwd)) are evaluated later by the job shell on the
# compute node.  $extra_sbatch may be empty, leaving a harmless blank line.
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J ${job_prefix}-${job_slug}
#SBATCH --account=${account}
#SBATCH -N 1
${sbatch_device_opts}
${sbatch_time}
#SBATCH --qos=${qos}
${extra_sbatch}
#SBATCH -o ${output_file}

set -e
set -x

cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

job_slug="$job_slug"
job_device="$device"
job_interface="$interface"
job_shard="$shard"
job_cluster="$cluster"

. ./mfc.sh load -c $compiler_flag -m $module_mode

$sbatch_script_contents

EOT
)
| 194 | + |
# --- Extract job ID ---
# sbatch prints "Submitted batch job <id>".  Take only the FIRST number
# ('grep -oE' emits every match on its own line, so a bare grep could
# yield a multi-line job_id), and neutralize a failed pipeline with
# '|| true': under 'set -euo pipefail' an unmatched grep would otherwise
# kill the script before the error branch below could report the output.
job_id=$(echo "$submit_output" | grep -oE '[0-9]+' | head -n1 || true)
if [ -z "$job_id" ]; then
  echo "ERROR: Failed to submit job. sbatch output:" >&2
  echo "$submit_output" >&2
  exit 1
fi

echo "Submitted batch job $job_id"
# Persist the job ID so a rerun can cancel this job if it is still active.
echo "$job_id" > "$id_file"
echo "Job ID written to $id_file"

# --- Monitor ---
# Blocks until the job finishes; the monitor's exit status becomes ours.
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
0 commit comments