2 files changed: +47 -2 lines changed
Original file line number | Diff line number | Diff line change
#!/bin/bash
#SBATCH --constraint=gpu
#SBATCH --dependency=singleton
#SBATCH --job-name=replay_test
#SBATCH --time=01:30:00
#SBATCH --mail-type=ALL
#
# Checkpoint/replay timing experiment for Pennant.
#
# For each node count, run Pennant once with automatic checkpointing every
# $freq steps, then restart it from every checkpoint that was written with
# -checkpoint:measure_replay_time_and_exit.
#
# Required environment:
#   SLURM_JOB_NUM_NODES  node count(s) to run on (set by Slurm inside a job)
#   SCRATCH              directory under which checkpoints are stored
#
# Must be submitted from the directory that contains the `pennant` binary and
# the `pennant.tests` inputs. Uses bash features ([[ ]], pushd/popd, arrays of
# positional args), hence the bash shebang.

readonly root_dir="$PWD"

export LD_LIBRARY_PATH="$PWD"

ulimit -S -c 0 # disable core dumps

experiment_name="$(basename "$root_dir")"
readonly experiment_name

# 1000 iterations runs about 100 seconds, so we're going to do
# 20 * 1000 = 20000 iterations to run about 33 minutes
readonly num_checkpoints=20
readonly freq=1000 # steps between automatic checkpoints

#######################################
# Print the path of the input mesh for a run.
# Globals:   root_dir, num_checkpoints, freq (read)
# Arguments: $1 - number of nodes
# Outputs:   mesh path on stdout
#######################################
mesh_path() {
  local nodes=$1
  printf '%s/pennant.tests/leblanc_long%sx%s/leblanc.pnt\n' \
    "$root_dir" "$((nodes * 4))" "$((num_checkpoints * freq))"
}

#######################################
# Launch Pennant through srun with the options shared by every run.
# Globals:   root_dir (read)
# Arguments: $1 - number of nodes (one task per node)
#            $2 - slug used to name the log_*.log and out_*.out files
#            $3... - extra Pennant options appended to the command line
# Outputs:   Pennant's stdout, also copied to out_<slug>.out in the cwd
#######################################
run_pennant() {
  local nodes=$1
  local slug=$2
  shift 2

  srun -n "$nodes" -N "$nodes" --ntasks-per-node 1 --cpu_bind none \
    "$root_dir/pennant" -n "$nodes" -f "$(mesh_path "$nodes")" \
    -prune 30 \
    -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 \
    -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 \
    -ll:rsize 0 -ll:gsize 0 \
    -lg:eager_alloc_percentage 5 -lg:no_tracing \
    -level 3 -logfile "log_${slug}_%.log" \
    "$@" \
    | tee "out_${slug}.out"
}

#######################################
# Run the original (checkpointing) job and then one replay per checkpoint.
# Globals:   SLURM_JOB_NUM_NODES, SCRATCH, experiment_name,
#            num_checkpoints, freq (read)
# Returns:   non-zero if the environment is incomplete or the output
#            directory cannot be entered
#######################################
main() {
  # Refuse to run outside a Slurm job or without a scratch area: an empty
  # SCRATCH would otherwise make us create checkpoint directories under "/".
  if [[ -z "${SLURM_JOB_NUM_NODES:-}" ]]; then
    echo "error: SLURM_JOB_NUM_NODES is not set; submit this script with sbatch" >&2
    return 1
  fi
  if [[ -z "${SCRATCH:-}" ]]; then
    echo "error: SCRATCH is not set" >&2
    return 1
  fi

  mkdir -p checkpoint || return 1
  pushd checkpoint > /dev/null || return 1

  local n slug checkpoint_dir replay rep
  # Intentionally unquoted so a whitespace-separated list of node counts
  # can be substituted here.
  # shellcheck disable=SC2086
  for n in $SLURM_JOB_NUM_NODES; do
    slug="${n}x1_f${freq}_orig"
    echo "Running $slug"
    checkpoint_dir="$SCRATCH/$experiment_name/$slug"
    mkdir -p "$checkpoint_dir"

    run_pennant "$n" "$slug" \
      -checkpoint:prefix "$checkpoint_dir" \
      -checkpoint:auto_steps "$freq"

    for ((replay = 0; replay < num_checkpoints; replay++)); do
      for rep in 0; do
        slug="${n}x1_f${freq}_replay${replay}_r${rep}"
        echo "Running $slug"
        run_pennant "$n" "$slug" \
          -checkpoint:prefix "$checkpoint_dir" \
          -checkpoint:replay "$replay" \
          -checkpoint:auto_steps "$freq" \
          -checkpoint:measure_replay_time_and_exit
      done
    done

    # # Clean up checkpoints, otherwise we use too much space
    # rm -rf "$checkpoint_dir"
  done

  popd > /dev/null
}

main "$@"
Original file line number Diff line number Diff line change @@ -23,7 +23,7 @@ for n in $SLURM_JOB_NUM_NODES; do
23
23
echo " Running $slug "
24
24
checkpoint_dir=" $SCRATCH /$experiment_name /$slug "
25
25
mkdir -p " $checkpoint_dir "
26
- srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none " $root_dir /pennant" -n " $n " -f " $root_dir " /pennant.tests/leblanc_long" $(( n * 4 )) " x1000/leblanc.pnt -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_" $slug " _%.log -checkpoint:prefix " $checkpoint_dir " -checkpoint:auto_steps $freq | tee out_" $slug " .out
26
+ srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none " $root_dir /pennant" -n " $n " -f " $root_dir " /pennant.tests/leblanc_long" $(( n * 4 )) " x1000/leblanc.pnt -prune 30 - hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_" $slug " _%.log -checkpoint:prefix " $checkpoint_dir " -checkpoint:auto_steps $freq | tee out_" $slug " .out
27
27
# -dm:memoize -lg:parallel_replay 2
28
28
29
29
# Clean up frequent checkpoints, otherwise we use too much space
@@ -44,7 +44,7 @@ for n in $SLURM_JOB_NUM_NODES; do
44
44
freq=0
45
45
slug=" ${n} x1_f${freq} _r${r} "
46
46
echo " Running $slug "
47
- srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none " $root_dir /pennant" -n " $n " -f " $root_dir " /pennant.tests/leblanc_long" $(( n * 4 )) " x1000/leblanc.pnt -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_" $slug " _%.log -checkpoint:disable | tee out_" $slug " .out
47
+ srun -n $n -N $n --ntasks-per-node 1 --cpu_bind none " $root_dir /pennant" -n " $n " -f " $root_dir " /pennant.tests/leblanc_long" $(( n * 4 )) " x1000/leblanc.pnt -prune 30 - hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 4 -ll:csize 13000 -ll:fsize 13000 -ll:zsize 36000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 5 -lg:no_tracing -level 3 -logfile log_" $slug " _%.log -checkpoint:disable | tee out_" $slug " .out
48
48
# -dm:memoize -lg:parallel_replay 2
49
49
done
50
50
done
You can’t perform that action at this time.
0 commit comments