Merge pull request #55 from gkaf89/bugfix/update-srun-api
Update srun api
gkaf89 authored Apr 17, 2024
2 parents 5332976 + b7ba14a commit b14476f
Showing 32 changed files with 9 additions and 102 deletions.
4 changes: 1 addition & 3 deletions docs/development/performance-debugging-tools/advisor.md
@@ -70,10 +70,8 @@ module load toolchain/intel/2019a
module load perf/Advisor/2019_update4
module load vis/GTK+/3.24.8-GCCcore-8.2.0

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
export OMP_NUM_THREADS=16
advixe-cl -collect survey -project-dir my_result -- ./a.out

```


@@ -112,4 +110,4 @@ The figure below shows the hybrid (MPI+OpenMP) programming analysis results:

!!! tip
If you find some issues with the instructions above,
please report it to us using [support ticket](https://hpc.uni.lu/support).
please report it to us using [support ticket](https://hpc.uni.lu/support).
2 changes: 0 additions & 2 deletions docs/development/performance-debugging-tools/aps.md
@@ -91,7 +91,6 @@ module load swenv/default-env/v1.2-20191021-production
module load tools/VTune/2019_update4
module load toolchain/intel/2019a

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
export OMP_NUM_THREADS=16
aps --collection-mode=all -r report_output ./a.out
```
@@ -116,7 +115,6 @@ module load swenv/default-env/v1.2-20191021-production
module load tools/VTune/2019_update4
module load toolchain/intel/2019a

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
# To collect all the results
export MPS_STAT_LEVEL=${SLURM_CPUS_PER_TASK:-1}
# An option for the OpenMP+MPI application
3 changes: 1 addition & 2 deletions docs/development/performance-debugging-tools/arm-forge.md
@@ -54,7 +54,6 @@ module load toolchain/intel/2019a
module load tools/ArmForge/19.1
module load tools/ArmReports/19.1

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}

# for debugging
@@ -99,4 +98,4 @@ To see the result

!!! tip
If you find some issues with the instructions above,
please report it to us using [support ticket](https://hpc.uni.lu/support).
please report it to us using [support ticket](https://hpc.uni.lu/support).
1 change: 0 additions & 1 deletion docs/development/performance-debugging-tools/inspector.md
@@ -92,7 +92,6 @@ module load toolchain/intel/2019a
module load tools/Inspector/2019_update4
module load vis/GTK+/3.24.8-GCCcore-8.2.0

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
inspxe-cl -collect mi1 -result-dir mi1 -- ./a.out
```
To see the result:
3 changes: 1 addition & 2 deletions docs/development/performance-debugging-tools/itac.md
@@ -53,7 +53,6 @@ module load toolchain/intel/2019a
module load tools/itac/2019.4.036
module load vis/GTK+/3.24.8-GCCcore-8.2.0

$ export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
$ export OMP_NUM_THREADS=16
$ -trace-collective ./a.out
```
@@ -102,4 +101,4 @@ $ stftool tracefile.stf --print-statistics

!!! tip
If you find some issues with the instructions above,
please report it to us using [support ticket](https://hpc.uni.lu/support).
please report it to us using [support ticket](https://hpc.uni.lu/support).
3 changes: 1 addition & 2 deletions docs/development/performance-debugging-tools/scalasca.md
@@ -52,7 +52,6 @@ module load toolchain/foss/2018a
module load perf/Scalasca/2.3.1-foss-2018a
module load perf/Score-P/3.1-foss-2018a

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
export OMP_NUM_THREADS=16

# analyze
@@ -100,4 +99,4 @@ $ scalasca -examine result_folder
```
!!! tip
If you find some issues with the instructions above,
please report it to us using [support ticket](https://hpc.uni.lu/support).
please report it to us using [support ticket](https://hpc.uni.lu/support).
1 change: 0 additions & 1 deletion docs/development/performance-debugging-tools/vtune.md
@@ -48,7 +48,6 @@ module load toolchain/intel/2019a
module load tools/VTune/2019_update4
module load vis/GTK+/3.24.8-GCCcore-8.2.0

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
export OMP_NUM_THREADS=16
amplxe-cl -collect hotspots -r my_result ./a.out
```
1 change: 0 additions & 1 deletion docs/environment/conda.md
@@ -148,7 +148,6 @@ echo "Numb. of cores: ${SLURM_CPUS_PER_TASK}"
micromamba activate R-project
export SRUN_CPUS_PER_TASK="${SLURM_CPUS_PER_TASK}"
export OMP_NUM_THREADS=1
srun Rscript --no-save --no-restore script.R
1 change: 0 additions & 1 deletion docs/jobs/gpu.md
@@ -28,7 +28,6 @@ print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
module load numlib/cuDNN # Example with cuDNN

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK # Propagate Slurm 'cpus-per-task' to srun
[...]
```

1 change: 0 additions & 1 deletion docs/services/jupyter.md
@@ -59,7 +59,6 @@ The following script is an example of how to proceed:

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

# Python 3.X by default (also on system)
module load lang/Python
10 changes: 5 additions & 5 deletions docs/slurm/index.md
@@ -102,12 +102,12 @@ Within a job, you aim at running a certain number of **tasks**, and Slurm allow

The total number of tasks defined in a given job is stored in the `$SLURM_NTASKS` environment variable.

!!! note "--cpus-per-task in srun since Slurm 22.05"
Beginning with Slurm 22.05, srun will not inherit the --cpus-per-task value requested by salloc or sbatch. It must be requested again with the call to srun or set with the SRUN_CPUS_PER_TASK environment variable if desired for the task(s).
!!! note "The --cpus-per-task option of srun in Slurm 23.11 and later"
In the latest versions of Slurm, `srun` inherits the `--cpus-per-task` value requested by `salloc` or `sbatch` by reading the value of `SLURM_CPUS_PER_TASK`, as it does for any other option. _This behavior may differ from some older versions, where special handling was required to propagate the `--cpus-per-task` option to `srun`._

This is very convenient to abstract from the job context to run MPI tasks/processes in parallel using for instance:
If you would like to launch multiple programs in a single allocation/batch script, divide the resources accordingly by requesting a subset of them with `srun` when launching each process, for instance:
```bash
srun -c ${SLURM_CPUS_PER_TASK} -n ${SLURM_NTASKS} [...]
srun --cpus-per-task <some of the SLURM_CPUS_PER_TASK> --ntasks <some of the SLURM_NTASKS> [...] <program>
```
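
As a hypothetical illustration of that pattern (the program names and resource counts below are invented, not part of the original documentation), each `srun` step is backgrounded with a share of the allocated tasks, and `wait` blocks until all steps finish:

```bash
# Assuming an allocation with --ntasks=4 -c 2 (SLURM_NTASKS=4, SLURM_CPUS_PER_TASK=2)
srun --ntasks 2 --cpus-per-task 2 --exact ./program_a &  # first half of the tasks
srun --ntasks 2 --cpus-per-task 2 --exact ./program_b &  # second half of the tasks
wait                                                     # wait for both job steps to complete
```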

We encourage you to **always** explicitly specify, upon resource allocation, the number of tasks you want _per_ node/socket (`--ntasks-per-node <n> --ntasks-per-socket <s>`), so that you can easily scale to multiple nodes with `-N <N>`. Adapt the number of threads and the settings to match the physical NUMA characteristics of the nodes
@@ -344,5 +344,5 @@ submitted.
| `-N <N>` | `SLURM_JOB_NUM_NODES` or<br/> `SLURM_NNODES` | |
| `--ntasks-per-node=<n>` | `SLURM_NTASKS_PER_NODE` | |
| `--ntasks-per-socket=<s>` | `SLURM_NTASKS_PER_SOCKET` | |
| `-c <c>` | `SLURM_CPUS_PER_TASK` | `OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}` and `SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}` |
| `-c <c>` | `SLURM_CPUS_PER_TASK` | `OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}` |
| | `SLURM_NTASKS`<br/> Total number of tasks | `srun -n $SLURM_NTASKS [...]` |
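
To make the new behavior concrete, here is a minimal launcher sketch, assuming Slurm 23.11 or later and a hypothetical `./my_app` binary (the module and resource values are placeholders): `srun` now reads `SLURM_CPUS_PER_TASK` from the allocation, so the `export SRUN_CPUS_PER_TASK=...` line removed throughout this pull request is no longer needed.

```bash
#!/bin/bash -l
#SBATCH --ntasks-per-node=2
#SBATCH -c 4                     # sets SLURM_CPUS_PER_TASK=4
#SBATCH --time=00:10:00

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
module load toolchain/foss       # placeholder; load what your application needs

export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}

# With Slurm >= 23.11, srun inherits -c/--cpus-per-task from the job
# allocation, so no 'export SRUN_CPUS_PER_TASK=...' is required here.
srun ./my_app
```

On older Slurm releases (22.05 up to 23.02), the export would still be required for `srun` to pick up the `-c` value, which is why the launchers previously carried it.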
20 changes: 0 additions & 20 deletions docs/slurm/launchers.md
@@ -103,8 +103,6 @@ When setting your default `#SBATCH` directive, always keep in mind your expected
module purge || print_error_and_exit "No 'module' command"
# List modules required for execution of the task
module load <...>
# Propagate Slurm "-c" option to srun
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
# [...]
```

@@ -121,7 +119,6 @@ When setting your default `#SBATCH` directive, always keep in mind your expected

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load <...>
# [...]
```
@@ -139,7 +136,6 @@ When setting your default `#SBATCH` directive, always keep in mind your expected

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load <...>
# [...]
```
@@ -193,7 +189,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
# C/C++: module load toolchain/intel # OR: module load toolchain/foss
# Java: module load lang/Java/1.8
# Ruby/Perl/Rust...: module load lang/{Ruby,Perl,Rust...}
@@ -216,7 +211,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
# Python 3.X by default (also on system)
module load lang/Python
# module load lang/SciPy-bundle
@@ -240,7 +234,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load lang/R
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
OPTS=$*
@@ -262,7 +255,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load math/MATLAB

matlab -nodisplay -nosplash < INPUTFILE.m > OUTPUTFILE.out
@@ -287,7 +279,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load <...>
# [...]
```
@@ -305,7 +296,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load <...> # USE apps compiled against the {foss,intel}cuda toolchain !
# Ex:
# module load numlib/cuDNN
@@ -339,7 +329,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/foss

export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
@@ -361,7 +350,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/foss

export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
@@ -391,7 +379,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/intel
OPTS=$*

@@ -413,7 +400,6 @@ Luckily, we have prepared a [generic GNU Parallel launcher](https://github.com/U

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/intel
OPTS=$*

@@ -442,7 +428,6 @@ You may want to use [PMIx](https://pmix.github.io/standard) as MPI initiator --

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/foss
module load mpi/OpenMPI
OPTS=$*
@@ -464,7 +449,6 @@ You may want to use [PMIx](https://pmix.github.io/standard) as MPI initiator --

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/foss
module load mpi/OpenMPI
OPTS=$*
@@ -489,7 +473,6 @@ You may want to use [PMIx](https://pmix.github.io/standard) as MPI initiator --

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/intel
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
OPTS=$*
@@ -512,7 +495,6 @@ You may want to use [PMIx](https://pmix.github.io/standard) as MPI initiator --

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/intel
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
OPTS=$*
@@ -537,7 +519,6 @@ You may want to use [PMIx](https://pmix.github.io/standard) as MPI initiator --

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/foss
module load mpi/OpenMPI
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
@@ -561,7 +542,6 @@ You may want to use [PMIx](https://pmix.github.io/standard) as MPI initiator --

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load toolchain/foss
module load mpi/OpenMPI
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
4 changes: 0 additions & 4 deletions docs/software/build.md
@@ -173,8 +173,6 @@ From that point, the compiled software and associated module is available in you
#SBATCH -c <thread>
print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
# Propagate Slurm "cpus-per-task" to srun
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
# Safeguard for NOT running this launcher on access/login nodes
module purge || print_error_and_exit "No 'module' command"
@@ -211,8 +209,6 @@ From that point, the compiled software and associated module is available in the
#SBATCH -c <thread>
print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
# Propagate Slurm "cpus-per-task" to srun
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
# Safeguard for NOT running this launcher on access/login nodes
module purge || print_error_and_exit "No 'module' command"
6 changes: 0 additions & 6 deletions docs/software/cae/abaqus.md
@@ -87,9 +87,6 @@ $ si --x11 -c 8 # Abaqus mp_mode=threads test
# OR
$ si --x11 --ntasks-per-node 8 # abaqus mp_mode=mpi test

# Propagate Slurm "cpus-per-task / -c" to srun
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

# Load the module ABAQUS and needed environment
(node)$ module purge
(node)$ module load cae/ABAQUS
@@ -161,7 +158,6 @@ abaqus job=<jobname> resume

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load cae/ABAQUS
# export LM_LICENSE_FILE=[...]
unset SLURM_GTIDS
@@ -185,7 +181,6 @@ abaqus job=<jobname> resume

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load cae/ABAQUS
# export LM_LICENSE_FILE=[...]
unset SLURM_GTIDS
@@ -211,7 +206,6 @@ abaqus job=<jobname> resume

print_error_and_exit() { echo "***ERROR*** $*"; exit 1; }
module purge || print_error_and_exit "No 'module' command"
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
module load cae/ABAQUS
# export LM_LICENSE_FILE=[...]
unset SLURM_GTIDS
4 changes: 0 additions & 4 deletions docs/software/cae/ansys.md
@@ -25,9 +25,6 @@ $ ssh -X iris-cluster
# Reserve the node for interactive computation
$ salloc -p interactive --time=00:30:00 --ntasks 1 -c 4 --x11

# Propagate Slurm "cpus-per-task / -c" to srun
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

# Load the required version of ANSYS and needed environment
$ module purge
$ module load toolchain/intel/2019a
@@ -70,7 +67,6 @@ module load tools/ANSYS/19.4
# The Input file
defFile=Benchmark.def

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
MYHOSTLIST=$(srun hostname | sort | uniq -c | awk '{print $2 "*" $1}' | paste -sd, -)
echo $MYHOSTLIST
cfx5solve -double -def $defFile -start-method "Platform MPI Distributed Parallel" -par-dist $MYHOSTLIST
2 changes: 0 additions & 2 deletions docs/software/cae/fds.md
@@ -24,7 +24,6 @@ $ salloc -p interactive --time=00:30:00 --ntasks 1 -c 4 --x11
$ module purge
$ module load swenv/default-env/devel
$ module load phys/FDS/6.7.3-intel-2019a
$ export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

# Example in fds
$ fds example.fds
@@ -61,7 +60,6 @@ module purge
module load swenv/default-env/devel
module load phys/FDS/6.7.3-intel-2019a

export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
srun fds example.fds
```
