Skip to content

Commit af4809e

Browse files
authored
Merge branch 'master' into ruff
2 parents 4de1749 + 506c6f5 commit af4809e

22 files changed

+419
-476
lines changed

.github/scripts/prebuild-case-optimization.sh

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
#!/bin/bash
22

33
# Pre-builds all benchmark cases with --case-optimization.
4+
# No GPU hardware needed — compilation only.
45
# Can run in two modes:
56
# 1. Direct (Frontier login nodes): pass cluster/device/interface as args
6-
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh
7+
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
78
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]
89

910
set -e
1011

11-
# Support both positional args (direct invocation) and env vars (SLURM via submit.sh)
12+
# Support both positional args (direct invocation) and env vars (SLURM)
1213
cluster="${1:-${job_cluster:-phoenix}}"
1314
job_device="${2:-$job_device}"
1415
job_interface="${3:-$job_interface}"
@@ -24,7 +25,15 @@ esac
2425
rm -rf build
2526

2627
. ./mfc.sh load -c "$flag" -m g
27-
source .github/scripts/gpu-opts.sh
28+
29+
# Set GPU build flags from interface — this is always a GPU build.
30+
# Don't use gpu-opts.sh since $job_device may be "cpu" when submitted
31+
# to a CPU SLURM partition (no GPU hardware needed for compilation).
32+
case "$job_interface" in
33+
acc) gpu_opts="--gpu acc" ;;
34+
omp) gpu_opts="--gpu mp" ;;
35+
*) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
36+
esac
2837

2938
for case in benchmarks/*/case.py; do
3039
echo "=== Pre-building: $case ==="

.github/scripts/retry-build.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
#!/bin/bash
22
# Provides retry_build(): 2-attempt loop.
33
# On failure of attempt 1, nukes the entire build directory before attempt 2.
4-
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
4+
# If RETRY_VALIDATE_CMD is set, runs it after a successful build; a non-zero
5+
# exit triggers the same nuke-and-retry, catching e.g. SIGILL from binaries
6+
# compiled on a different CPU architecture.
57
# Usage: source .github/scripts/retry-build.sh
68
# retry_build ./mfc.sh build -j 8 --gpu acc
9+
# RETRY_VALIDATE_CMD='./syscheck' retry_build ./mfc.sh build -j 8
710

811
retry_build() {
9-
local validate_cmd="${RETRY_VALIDATE_CMD:-}"
1012
local max_attempts=2
13+
local validate_cmd="${RETRY_VALIDATE_CMD:-}"
1114
local attempt=1
1215
while [ $attempt -le $max_attempts ]; do
1316
echo "Build attempt $attempt of $max_attempts..."

.github/scripts/run-tests-with-retry.sh

Lines changed: 0 additions & 34 deletions
This file was deleted.

.github/scripts/run_case_optimization.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ for case in "${benchmarks[@]}"; do
4444
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"
4545

4646
# Build + run with --case-optimization, small grid, 10 timesteps
47-
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then
47+
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
4848
# Validate output
4949
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
5050
echo "PASS: $case_name"

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,24 +24,9 @@ echo "=========================================="
2424
# both parallel jobs so PR and master always land on the same GPU type.
2525
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
2626
echo "Selecting Phoenix GPU partition for benchmark consistency..."
27-
# Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave
28-
# large modern nodes (h200, h100, a100) free for production workloads.
29-
# rtx6000 has the most nodes and gives the most consistent baselines.
30-
BENCH_GPU_PARTITION=""
31-
for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
32-
# || true: grep -c exits 1 on zero matches (or when sinfo returns no output
33-
# for an unknown partition); suppress so set -euo pipefail doesn't abort.
34-
idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
35-
if [ "${idle:-0}" -gt 0 ]; then
36-
BENCH_GPU_PARTITION="$part"
37-
echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)"
38-
break
39-
fi
40-
done
41-
if [ -z "$BENCH_GPU_PARTITION" ]; then
42-
echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
43-
BENCH_GPU_PARTITION="gpu-rtx6000"
44-
fi
27+
# Require 2 nodes so both PR and master jobs can run concurrently.
28+
GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh"
29+
BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION"
4530
export BENCH_GPU_PARTITION
4631
fi
4732

@@ -57,12 +42,13 @@ echo "Master job started in background (PID: $master_pid)"
5742

5843
echo "Waiting for both jobs to complete..."
5944

60-
# Wait and capture exit codes reliably
45+
# Wait and capture exit codes reliably.
46+
# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
47+
# (which would orphan the second job).
6148
pr_exit=0
6249
master_exit=0
6350

64-
wait "$pr_pid"
65-
pr_exit=$?
51+
wait "$pr_pid" || pr_exit=$?
6652
if [ "$pr_exit" -ne 0 ]; then
6753
echo "PR job exited with code: $pr_exit"
6854
echo "Last 50 lines of PR job log:"
@@ -71,8 +57,7 @@ else
7157
echo "PR job completed successfully"
7258
fi
7359

74-
wait "$master_pid"
75-
master_exit=$?
60+
wait "$master_pid" || master_exit=$?
7661
if [ "$master_exit" -ne 0 ]; then
7762
echo "Master job exited with code: $master_exit"
7863
echo "Last 50 lines of master job log:"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Pick the best currently-available Phoenix GPU partition via sinfo and
# publish the choice to the sourcing script as SELECTED_GPU_PARTITION.
#
# The candidate list is ordered by expected availability; gpu-v100 comes
# last (slow enough to flirt with the test time limit) and the RTX 6000
# partition is deliberately absent (too slow for the suite). If no
# candidate has enough idle/mix nodes we fall back to gpu-l40s and accept
# possible queueing.
#
# Optional knob: export GPU_PARTITION_MIN_NODES before sourcing to demand
# more than one idle/mix node (e.g. 2 when PR and master jobs run in parallel).
#
# Usage: source .github/scripts/select-gpu-partition.sh

_gp_min="${GPU_PARTITION_MIN_NODES:-1}"
_gp_fallback="gpu-l40s"

SELECTED_GPU_PARTITION=""
for _gp in gpu-l40s gpu-h200 gpu-h100 gpu-a100 gpu-v100; do
    # grep -c exits non-zero on zero matches (and sinfo may print nothing for
    # an unknown partition); `|| true` keeps a set -euo pipefail caller alive.
    _gp_idle=$(sinfo -p "$_gp" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
    [ "${_gp_idle:-0}" -ge "$_gp_min" ] || continue
    SELECTED_GPU_PARTITION="$_gp"
    echo "Selected GPU partition: $SELECTED_GPU_PARTITION ($_gp_idle idle/mix nodes)"
    break
done

if [ -z "$SELECTED_GPU_PARTITION" ]; then
    echo "WARNING: No idle GPU partition found; falling back to $_gp_fallback (may queue)"
    SELECTED_GPU_PARTITION="$_gp_fallback"
fi

export SELECTED_GPU_PARTITION
# Sourced into the caller's shell: leave no temporary names behind.
unset _gp_min _gp_fallback _gp _gp_idle
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
#!/bin/bash
# Unified SLURM job submission and monitoring for all clusters.
# Submits a script as a SLURM batch job, then monitors it until completion.
# Rerun-safe: cancels stale jobs from previous runs before resubmission.
#
# Usage: submit-slurm-job.sh <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard]

set -euo pipefail

# Ignore SIGHUP to survive login node session drops
trap '' HUP

usage() {
    echo "Usage: $0 <script.sh> <cpu|gpu> <none|acc|omp> <cluster> [shard]"
}

script_path="${1:-}"
device="${2:-}"
interface="${3:-}"
cluster="${4:-}"
shard="${5:-}"

if [ -z "$script_path" ] || [ -z "$device" ] || [ -z "$interface" ] || [ -z "$cluster" ]; then
    usage
    exit 1
fi

# The submitted script's text is inlined into the sbatch here-doc below so the
# batch job runs an immutable snapshot of it.
sbatch_script_contents=$(cat "$script_path")
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Detect job type from submitted script basename
script_basename="$(basename "$script_path" .sh)"
case "$script_basename" in
    bench*) job_type="bench" ;;
    *)      job_type="test" ;;
esac

# --- Cluster configuration ---
case "$cluster" in
    phoenix)
        compiler_flag="p"
        account="gts-sbryngelson3"
        job_prefix="shb"
        qos="embers"
        extra_sbatch="#SBATCH --requeue"
        test_time="03:00:00"
        bench_time="04:00:00"
        gpu_partition_dynamic=true
        ;;
    frontier)
        compiler_flag="f"
        account="CFD154"
        job_prefix="MFC"
        qos="develop"
        extra_sbatch=""
        test_time="01:59:00"
        bench_time="01:59:00"
        gpu_partition_dynamic=false
        ;;
    frontier_amd)
        compiler_flag="famd"
        account="CFD154"
        job_prefix="MFC"
        qos="develop"
        extra_sbatch=""
        test_time="01:59:00"
        bench_time="01:59:00"
        gpu_partition_dynamic=false
        ;;
    *)
        echo "ERROR: Unknown cluster '$cluster'"
        exit 1
        ;;
esac

# --- Time limit ---
if [ "$job_type" = "bench" ]; then
    sbatch_time="#SBATCH -t $bench_time"
else
    sbatch_time="#SBATCH -t $test_time"
fi

# --- Device-specific SBATCH options ---
if [ "$device" = "cpu" ]; then
    case "$cluster" in
        phoenix)
            sbatch_device_opts="\
#SBATCH -p cpu-small
#SBATCH --ntasks-per-node=24
#SBATCH --mem-per-cpu=2G"
            ;;
        frontier|frontier_amd)
            sbatch_device_opts="\
#SBATCH -n 32
#SBATCH -p service"
            ;;
    esac
elif [ "$device" = "gpu" ]; then
    # Determine GPU partition
    gpu_partition="batch"
    if [ "$gpu_partition_dynamic" = "true" ]; then
        # Use pre-selected bench partition if available, otherwise query sinfo
        if [ -n "${BENCH_GPU_PARTITION:-}" ]; then
            gpu_partition="$BENCH_GPU_PARTITION"
            echo "Using pre-selected bench partition: $gpu_partition (PR/master consistency)"
        else
            source "${SCRIPT_DIR}/select-gpu-partition.sh"
            gpu_partition="$SELECTED_GPU_PARTITION"
        fi
    fi

    case "$cluster" in
        phoenix)
            sbatch_device_opts="\
#SBATCH -p $gpu_partition
#SBATCH --ntasks-per-node=4
#SBATCH -G2
#SBATCH --exclude=atl1-1-03-002-29-0"
            ;;
        frontier|frontier_amd)
            sbatch_device_opts="\
#SBATCH -n 8
#SBATCH -p service"
            ;;
    esac
else
    usage
    exit 1
fi

# --- Job slug ---
shard_suffix=""
if [ -n "$shard" ]; then
    shard_suffix="-$(echo "$shard" | sed 's|/|-of-|')"
fi
job_slug="$(basename "$script_path" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-${device}-${interface}${shard_suffix}"
output_file="$job_slug.out"
id_file="${job_slug}.slurm_job_id"

# --- Idempotency: cancel stale jobs from previous runs ---
if [ -f "$id_file" ]; then
    existing_id=$(cat "$id_file")
    state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
    case "${state:-UNKNOWN}" in
        RUNNING|PENDING|REQUEUED|COMPLETING)
            echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission"
            scancel "$existing_id" 2>/dev/null || true
            ;;
        *)
            echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh"
            ;;
    esac
    rm -f "$id_file"
fi

# Remove stale output file so the monitor doesn't pick up old content
# (a previous SLURM job's epilog can write to the .out file after our
# stale-job check, polluting the new job's output stream).
rm -f "$output_file"

# --- Module load mode (short form) ---
module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c")

# --- Submit ---
# Unquoted here-doc delimiter: $variables expand NOW (submit time); the
# escaped \$SLURM_SUBMIT_DIR expands later, inside the running batch job.
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J ${job_prefix}-${job_slug}
#SBATCH --account=${account}
#SBATCH -N 1
${sbatch_device_opts}
${sbatch_time}
#SBATCH --qos=${qos}
${extra_sbatch}
#SBATCH -o ${output_file}

set -e
set -x

cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

job_slug="$job_slug"
job_device="$device"
job_interface="$interface"
job_shard="$shard"
job_cluster="$cluster"

. ./mfc.sh load -c $compiler_flag -m $module_mode

$sbatch_script_contents

EOT
)

# Extract the job id ("Submitted batch job <id>"). head -n1 guards against
# sbatch output ever containing more than one number, and `|| true` keeps
# set -euo pipefail from killing the script on a no-match grep — otherwise
# the ERROR branch below could never be reached.
job_id=$(echo "$submit_output" | grep -oE '[0-9]+' | head -n1 || true)
if [ -z "$job_id" ]; then
    echo "ERROR: Failed to submit job. sbatch output:"
    echo "$submit_output"
    exit 1
fi

echo "Submitted batch job $job_id"
echo "$job_id" > "$id_file"
echo "Job ID written to $id_file"

# --- Monitor ---
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"

0 commit comments

Comments
 (0)