Skip to content

Commit 46866ab

Browse files
Pennant experiment scripts.
1 parent fe3e99f commit 46866ab

File tree

2 files changed

+47
-2
lines changed

2 files changed

+47
-2
lines changed

experiment/replay_cpp_pennant.sh

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash
#SBATCH --constraint=gpu
#SBATCH --dependency=singleton
#SBATCH --job-name=replay_test
#SBATCH --time=01:30:00
#SBATCH --mail-type=ALL

# Pennant checkpoint/replay experiment.
#
# For the node count of the current Slurm allocation this script:
#   1. runs Pennant once with automatic checkpointing every $freq steps,
#      writing checkpoints under $SCRATCH/<experiment>/<slug>;
#   2. re-launches Pennant once per checkpoint index with -checkpoint:replay
#      and -checkpoint:measure_replay_time_and_exit.
#
# Required environment:
#   SCRATCH              - base directory for checkpoint data
#   SLURM_JOB_NUM_NODES  - provided by Slurm inside an allocation
#
# Must be submitted from the experiment directory: the pennant binary, its
# shared libraries and pennant.tests/ are all resolved relative to $PWD.
#
# The interpreter is bash (not sh) because the script relies on [[ ]],
# pushd/popd, arrays and functions with `local`.

: "${SCRATCH:?SCRATCH must be set; checkpoints are written under it}"
: "${SLURM_JOB_NUM_NODES:?SLURM_JOB_NUM_NODES is unset; run this via sbatch}"

root_dir="$PWD"

export LD_LIBRARY_PATH="$PWD"

ulimit -S -c 0 # disable core dumps

experiment_name="$(basename "$root_dir")"

# 1000 iterations take about 100 seconds, so 20 * 1000 = 20000 iterations
# run for about 33 minutes.
num_checkpoints=20

# Checkpoint interval in steps. The input deck is selected so that the total
# step count is num_checkpoints * freq.
freq=1000

#######################################
# Launch Pennant on n nodes (one task per node) and tee its stdout to
# out_<slug>.out in the current directory.
# Globals:   root_dir, num_checkpoints, freq (read)
# Arguments: $1 - node count
#            $2 - slug used in the log/output file names
#            $3 - checkpoint prefix directory
#            $@ - remaining args are appended after -checkpoint:prefix
# Outputs:   Pennant stdout (also written to out_<slug>.out)
# Returns:   exit status of tee (srun's status is not propagated)
#######################################
run_pennant() {
  local n=$1 slug=$2 checkpoint_dir=$3
  shift 3

  local mesh="$root_dir/pennant.tests/leblanc_long$(( n * 4 ))x$(( num_checkpoints * freq ))/leblanc.pnt"

  # Flags shared by the original run and every replay run; keeping them in
  # one place guarantees both kinds of run use the same configuration.
  local -a common_args=(
    -n "$n"
    -f "$mesh"
    -prune 30
    -hl:sched 1024
    -ll:gpu 1
    -ll:io 1
    -ll:util 2
    -ll:bgwork 4
    -ll:csize 13000
    -ll:fsize 13000
    -ll:zsize 36000
    -ll:rsize 0
    -ll:gsize 0
    -lg:eager_alloc_percentage 5
    -lg:no_tracing
    -level 3
    # NOTE(review): '%' is passed through literally; presumably the runtime
    # expands it per process - confirm.
    -logfile "log_${slug}_%.log"
    -checkpoint:prefix "$checkpoint_dir"
  )

  echo "Running $slug"
  srun -n "$n" -N "$n" --ntasks-per-node 1 --cpu_bind none \
    "$root_dir/pennant" "${common_args[@]}" "$@" \
    | tee "out_${slug}.out"
}

# All log_*/out_* files are written relative to ./checkpoint.
mkdir -p checkpoint || exit 1
pushd checkpoint || exit 1

# SLURM_JOB_NUM_NODES holds a single value; the loop form is kept so the body
# reads the same as the sibling sbatch script. Intentionally unquoted.
for n in $SLURM_JOB_NUM_NODES; do
  orig_slug="${n}x1_f${freq}_orig"
  checkpoint_dir="$SCRATCH/$experiment_name/$orig_slug"
  mkdir -p "$checkpoint_dir"

  # Original run: produces the checkpoints that the replays below consume.
  run_pennant "$n" "$orig_slug" "$checkpoint_dir" \
    -checkpoint:auto_steps "$freq"

  # One replay run per checkpoint index 0 .. num_checkpoints-1.
  for (( replay = 0; replay < num_checkpoints; replay++ )); do
    for rep in 0; do
      run_pennant "$n" "${n}x1_f${freq}_replay${replay}_r${rep}" "$checkpoint_dir" \
        -checkpoint:replay "$replay" \
        -checkpoint:auto_steps "$freq" \
        -checkpoint:measure_replay_time_and_exit
    done
  done

  # # Clean up checkpoints, otherwise we use too much space
  # rm -rf "$checkpoint_dir"
done

popd

experiment/sbatch_cpp_pennant.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ for n in $SLURM_JOB_NUM_NODES; do
2323
echo "Running $slug"
2424
checkpoint_dir="$SCRATCH/$experiment_name/$slug"
2525
mkdir -p "$checkpoint_dir"
26-
srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none "$root_dir/pennant" -n "$n" -f "$root_dir"/pennant.tests/leblanc_long"$(( n * 4 ))"x1000/leblanc.pnt -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:prefix "$checkpoint_dir" -checkpoint:auto_steps $freq | tee out_"$slug".out
26+
srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none "$root_dir/pennant" -n "$n" -f "$root_dir"/pennant.tests/leblanc_long"$(( n * 4 ))"x1000/leblanc.pnt -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:prefix "$checkpoint_dir" -checkpoint:auto_steps $freq | tee out_"$slug".out
2727
# -dm:memoize -lg:parallel_replay 2
2828

2929
# Clean up frequent checkpoints, otherwise we use too much space
@@ -44,7 +44,7 @@ for n in $SLURM_JOB_NUM_NODES; do
4444
freq=0
4545
slug="${n}x1_f${freq}_r${r}"
4646
echo "Running $slug"
47-
srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none "$root_dir/pennant" -n "$n" -f "$root_dir"/pennant.tests/leblanc_long"$(( n * 4 ))"x1000/leblanc.pnt -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:disable | tee out_"$slug".out
47+
srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none "$root_dir/pennant" -n "$n" -f "$root_dir"/pennant.tests/leblanc_long"$(( n * 4 ))"x1000/leblanc.pnt -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:disable | tee out_"$slug".out
4848
# -dm:memoize -lg:parallel_replay 2
4949
done
5050
done

0 commit comments

Comments
 (0)