From 753c4e2a20411b2d2764a3c2bdfc39151b966263 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Wed, 1 Nov 2023 12:44:16 -0400 Subject: [PATCH] Recommend srun instead of mpirun on great lakes. --- doc/clusters/greatlakes.rst | 16 ++++++++++++---- docker/greatlakes/test/job-cpu.sh | 8 +++----- docker/greatlakes/test/job-gpu.sh | 4 +--- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/doc/clusters/greatlakes.rst b/doc/clusters/greatlakes.rst index 6723c4be..08a46a9a 100644 --- a/doc/clusters/greatlakes.rst +++ b/doc/clusters/greatlakes.rst @@ -33,23 +33,31 @@ Use the following commands in your job scripts or interactively to execute softw Serial (or multithreaded) CPU jobs:: module load gcc/10.3.0 openmpi/4.1.6 singularity - mpirun -n 1 singularity exec --bind /scratch,/gpfs \ + srun -u --export=ALL -n 1 singularity exec --bind /scratch,/gpfs \ /scratch/your-account_root/your-account/$USER/software.sif command arguments Single GPU jobs:: module load gcc/10.3.0 openmpi/4.1.6 singularity - mpirun -n 1 singularity exec --bind /scratch,/gpfs --nv \ + srun -u --export=ALL -n 1 singularity exec --bind /scratch,/gpfs --nv \ /scratch/your-account_root/your-account/$USER/software.sif command arguments MPI parallel CPU jobs:: module load gcc/10.3.0 openmpi/4.1.6 singularity - mpirun singularity exec --bind /scratch,/gpfs \ + srun -u --export=ALL singularity exec --bind /scratch,/gpfs \ /scratch/your-account_root/your-account/$USER/software.sif command arguments MPI parallel GPU jobs:: module load gcc/10.3.0 openmpi/4.1.6 singularity - mpirun singularity exec --bind /scratch,/gpfs --nv \ + srun -u --export=ALL singularity exec --bind /scratch,/gpfs --nv \ /scratch/your-account_root/your-account/$USER/software.sif command arguments + +.. important:: + + Invoke parallel jobs with ``srun -u --export=ALL`` to ensure proper task distribution to the + requested resources (``mpirun`` oversubscribes resources in some cases). 
The ``--export=ALL`` + option should be the default behavior, but that default is not observed in testing. The ``-u`` option ensures that + stdout and stderr output is written to the file immediately. Without ``-u``, ``srun`` on Great + Lakes tends to buffer output until the job completes. diff --git a/docker/greatlakes/test/job-cpu.sh b/docker/greatlakes/test/job-cpu.sh index d8105a54..63c3dc70 100644 --- a/docker/greatlakes/test/job-cpu.sh +++ b/docker/greatlakes/test/job-cpu.sh @@ -7,17 +7,15 @@ #SBATCH --export=ALL #SBATCH -t 0:10:00 -export PMIX_MCA_psec=^munge - module load gcc/10.3.0 openmpi/4.1.6 singularity set -x cd $SLURM_SUBMIT_DIR -mpirun -n 1 singularity exec --bind /scratch,/gpfs software.sif python3 serial-cpu.py +srun -u --export=ALL -n 1 singularity exec --bind /scratch,/gpfs software.sif python3 serial-cpu.py -mpirun singularity exec --bind /scratch,/gpfs software.sif python3 mpi-cpu.py +srun -u --export=ALL singularity exec --bind /scratch,/gpfs software.sif python3 mpi-cpu.py -mpirun singularity exec --bind /scratch,/gpfs software.sif /opt/osu-micro-benchmarks/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bibw +srun -u --export=ALL singularity exec --bind /scratch,/gpfs software.sif /opt/osu-micro-benchmarks/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bibw echo "Tests complete." diff --git a/docker/greatlakes/test/job-gpu.sh b/docker/greatlakes/test/job-gpu.sh index 1af2d25c..4150d3b7 100644 --- a/docker/greatlakes/test/job-gpu.sh +++ b/docker/greatlakes/test/job-gpu.sh @@ -7,13 +7,11 @@ #SBATCH --export=ALL #SBATCH -t 0:10:00 -export PMIX_MCA_psec=^munge - module load gcc/10.3.0 openmpi/4.1.6 singularity set -x cd $SLURM_SUBMIT_DIR -mpirun -n 1 singularity exec --bind /scratch,/gpfs --nv software.sif python3 serial-gpu.py +srun -u --export=ALL -n 1 singularity exec --bind /scratch,/gpfs --nv software.sif python3 serial-gpu.py echo "Tests complete."