From 7f617112fc68178d218180212254baa52ae9cbbe Mon Sep 17 00:00:00 2001
From: =oren <=oren.leung@gmail.com>
Date: Sat, 8 Mar 2025 21:11:13 +0000
Subject: [PATCH 1/3] feat: all reduce bench slurm pyxis

---
 network/benchmarks/README.md                 |  8 ++++++-
 .../benchmarks/all_reduce_bench_pyxis.sbatch | 24 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 network/benchmarks/all_reduce_bench_pyxis.sbatch

diff --git a/network/benchmarks/README.md b/network/benchmarks/README.md
index 17acc64..afefc1a 100644
--- a/network/benchmarks/README.md
+++ b/network/benchmarks/README.md
@@ -79,9 +79,15 @@ Here is a simple all-reduce benchmark that you can use to quickly measure the th
 
 [all_reduce_bench.py](all_reduce_bench.py)
 
+On CSPs that have enabled the [SLURM Pyxis Container Plugin](https://github.com/NVIDIA/pyxis), such as CoreWeave, Crusoe, AWS, Oracle, Azure, GCP, etc., `all_reduce_bench.py` can be easily run & reproduced via the following command:
+```bash
+sbatch -N <num_nodes> ./all_reduce_bench_pyxis.sbatch
+```
+
 Usually benchmarking at least 4 nodes is recommended, but, of course, if you already have access to all the nodes you will be using during the training, benchmark using all of the nodes.
 
-To run it on 4 nodes:
+
+If do not you have access to an pyxis SLURM environment, to run it on 4 nodes:
 
 ```
 GPUS_PER_NODE=8
diff --git a/network/benchmarks/all_reduce_bench_pyxis.sbatch b/network/benchmarks/all_reduce_bench_pyxis.sbatch
new file mode 100644
index 0000000..c4a75b1
--- /dev/null
+++ b/network/benchmarks/all_reduce_bench_pyxis.sbatch
@@ -0,0 +1,24 @@
+#!/bin/bash
+#SBATCH --job-name=all_reduce_bench_pyxis
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=1
+#SBATCH --gres=gpu:8
+#SBATCH --time=01:00:00
+
+# Set up environment variables for torchrun
+GPUS_PER_NODE=8
+NNODES=2
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+
+srun --container-image=nvcr.io#nvidia/pytorch:25.02-py3 \
+    --container-mounts=$PWD:/workspace \
+    python -u -m torch.distributed.run \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --role `hostname -s`':' \
+    --tee 3 \
+    all_reduce_bench.py

From 4952a40a2c5fb92a556296225f632180bce8b9e1 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sat, 8 Mar 2025 20:14:42 -0800
Subject: [PATCH 2/3] Update network/benchmarks/README.md

---
 network/benchmarks/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/network/benchmarks/README.md b/network/benchmarks/README.md
index afefc1a..181f640 100644
--- a/network/benchmarks/README.md
+++ b/network/benchmarks/README.md
@@ -87,7 +87,7 @@ sbatch -N <num_nodes> ./all_reduce_bench_pyxis.sbatch
 
 Usually benchmarking at least 4 nodes is recommended, but, of course, if you already have access to all the nodes you will be using during the training, benchmark using all of the nodes.
 
 
-If do not you have access to an pyxis SLURM environment, to run it on 4 nodes:
+If you do not have access to a pyxis SLURM environment, to run it on 4 nodes:
 
 ```
 GPUS_PER_NODE=8

From 2b120e5971dd6be4612734f3cf497d11dc34e09f Mon Sep 17 00:00:00 2001
From: Oren <47992694+OrenLeung@users.noreply.github.com>
Date: Sat, 8 Mar 2025 23:17:35 -0500
Subject: [PATCH 3/3] fix num nodes

---
 network/benchmarks/all_reduce_bench_pyxis.sbatch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/network/benchmarks/all_reduce_bench_pyxis.sbatch b/network/benchmarks/all_reduce_bench_pyxis.sbatch
index c4a75b1..d68c2cc 100644
--- a/network/benchmarks/all_reduce_bench_pyxis.sbatch
+++ b/network/benchmarks/all_reduce_bench_pyxis.sbatch
@@ -7,7 +7,7 @@
 
 # Set up environment variables for torchrun
 GPUS_PER_NODE=8
-NNODES=2
+NNODES=$SLURM_NNODES
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
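
A minimal submission sketch for the script added by this series, assuming all three patches are applied, that the job is submitted from `network/benchmarks/` (so `all_reduce_bench.py` sits next to the sbatch file), and a 4-node run per the README's recommendation; the `--parsable` job-id capture and the `slurm-<jobid>.out` log name are ordinary sbatch defaults, not anything these patches define:

```bash
# Submit the pyxis-based all-reduce benchmark on 4 nodes. -N overrides the
# script's hardcoded "#SBATCH --nodes=2"; because patch 3/3 sets
# NNODES=$SLURM_NNODES, the torchrun launch picks up the actual node count.
cd network/benchmarks
JOBID=$(sbatch --parsable -N 4 ./all_reduce_bench_pyxis.sbatch)

# Check the queue and follow the benchmark output once the job starts;
# sbatch writes to slurm-<jobid>.out in the submission directory by default.
squeue -j "$JOBID"
tail -f "slurm-${JOBID}.out"
```

Because the script sets `--ntasks-per-node=1`, srun starts one torchrun launcher per node inside the container, and `--nproc_per_node $GPUS_PER_NODE` then spawns the 8 per-GPU workers on each node.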