Skip to content

Commit

Permalink
Update slurm to 24.05.5
Browse files Browse the repository at this point in the history
Fix MPI
  • Loading branch information
asteny committed Jan 9, 2025
1 parent 21b8ac2 commit 4b9bb16
Show file tree
Hide file tree
Showing 13 changed files with 72 additions and 70 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ CHART_OPERATOR_CRDS_PATH = $(CHART_PATH)/soperator-crds
CHART_CLUSTER_PATH = $(CHART_PATH)/slurm-cluster
CHART_STORAGE_PATH = $(CHART_PATH)/slurm-cluster-storage

SLURM_VERSION = 24.05.2
SLURM_VERSION = 24.05.5
UBUNTU_VERSION = jammy
VERSION = $(shell cat VERSION)

Expand Down
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,10 @@ This helps cluster administrators and users monitor resource utilization, enforc
implemented it yet.
- **Single-partition clusters**. Slurm's ability to split clusters into several partitions isn't supported now.
- **Software versions**. The list of software versions we currently support is quite short.
- Linux: Ubuntu [20.04](https://releases.ubuntu.com/focal/) and
[22.04](https://releases.ubuntu.com/jammy/).
- Slurm: versions `23.11.6` and `24.05.3`.
- CUDA: version [12.2.2](https://developer.nvidia.com/cuda-12-2-2-download-archive).
- Kubernetes: >= [1.29](https://kubernetes.io/blog/2023/08/15/kubernetes-v1-28-release/).
- Linux: Ubuntu [22.04](https://releases.ubuntu.com/jammy/).
- Slurm: version `24.05.5`.
- CUDA: version [12.4.1](https://developer.nvidia.com/cuda-12-4-1-download-archive).
- Kubernetes: >= [1.29](https://kubernetes.io/blog/2023/12/13/kubernetes-v1-29-release/).
- Versions of some preinstalled software packages can't be changed.


Expand Down
9 changes: 4 additions & 5 deletions docs/limitations.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ equipped with different GPU models, use different container images, have differe

### Software versions
Our list of supported software versions is pretty short right now:
- Linux distribution: Ubuntu [20.04](https://releases.ubuntu.com/focal/) and
[22.04](https://releases.ubuntu.com/jammy/).
- Slurm: versions `23.11.6` and `24.05.3`.
- CUDA: version [12.2.2](https://developer.nvidia.com/cuda-12-2-2-download-archive).
- Kubernetes: >= [1.28](https://kubernetes.io/blog/2023/08/15/kubernetes-v1-28-release/).
- Linux distribution: Ubuntu [22.04](https://releases.ubuntu.com/jammy/).
- Slurm: version `24.05.5`.
- CUDA: version [12.4.1](https://developer.nvidia.com/cuda-12-4-1-download-archive).
- Kubernetes: >= [1.29](https://kubernetes.io/blog/2023/12/13/kubernetes-v1-29-release/).
- Versions of some preinstalled software packages can't be changed.

Other versions may also be supported, but we haven't checked it yet. It would be cool if someone from the community
Expand Down
18 changes: 9 additions & 9 deletions helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,13 @@ telemetry: {}
# otelCollectorPort: 8429

images:
slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.16.1-jammy-slurm24.05.2"
slurmrestd: "cr.eu-north1.nebius.cloud/soperator/slurmrestd:1.16.1-jammy-slurm24.05.2"
slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.16.1-jammy-slurm24.05.2"
sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.16.1-jammy-slurm24.05.2"
munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.16.1-jammy-slurm24.05.2"
populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.16.1-jammy-slurm24.05.2"
ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.16.1-jammy-slurm24.05.2"
slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.16.1-jammy-slurm24.05.2"
exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.16.1-jammy-slurm24.05.2"
slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.16.1-jammy-slurm24.05.5"
slurmrestd: "cr.eu-north1.nebius.cloud/soperator/slurmrestd:1.16.1-jammy-slurm24.05.5"
slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.16.1-jammy-slurm24.05.5"
sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.16.1-jammy-slurm24.05.5"
munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.16.1-jammy-slurm24.05.5"
populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.16.1-jammy-slurm24.05.5"
ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.16.1-jammy-slurm24.05.5"
slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.16.1-jammy-slurm24.05.5"
exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.16.1-jammy-slurm24.05.5"
mariaDB: "docker-registry1.mariadb.com/library/mariadb:11.4.3"
4 changes: 2 additions & 2 deletions images/accounting/slurmdbd.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ ARG BASE_IMAGE=ubuntu:jammy

FROM $BASE_IMAGE AS controller_slurmdbd

ARG SLURM_VERSION=24.05.2
ARG CUDA_VERSION=12.2.2
ARG SLURM_VERSION=24.05.5
ARG CUDA_VERSION=12.4.1

ARG DEBIAN_FRONTEND=noninteractive

Expand Down
10 changes: 10 additions & 0 deletions images/common/scripts/install_openmpi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
#
# Installs a pinned OpenMPI build from the Mellanox MLNX_OFED apt repository.
#
# Steps:
#   1. Register the MLNX_OFED apt source that matches this distribution.
#   2. Trust the Mellanox package signing key.
#   3. Refresh the package index and install the pinned `openmpi` package.
#
# Needs enough privileges to write under /etc/apt/sources.list.d and to run
# apt-key / apt-get.

# Stop at the first failing command, unset variable or failed pipeline stage,
# so a broken download fails right here instead of surfacing later as a
# confusing "package not found" error.
set -euo pipefail

# Full apt package version (upstream version plus package revision).
# Assigned unconditionally on purpose: callers may export a shorter
# OPENMPI_VERSION (upstream version only) that must not leak into the pin.
readonly OPENMPI_VERSION=4.1.7a1-1.2310055
readonly OFED_VERSION=23.10-2.1.3.1
readonly SOURCES_DIR=/etc/apt/sources.list.d
readonly LIST_NAME=mellanox_mlnx_ofed.list

# Distribution id as used in the repository layout: ID followed by VERSION_ID
# from /etc/os-release (e.g. "ubuntu22.04"). Sourced in a subshell so the
# os-release variables do not pollute this script's environment.
DISTRO=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
readonly DISTRO

# Write to an explicit path: without -O, wget would save a second copy as
# "<name>.list.1" when the file already exists, and apt ignores that name.
wget -O "${SOURCES_DIR}/${LIST_NAME}" \
  "https://linux.mellanox.com/public/repo/mlnx_ofed/${OFED_VERSION}/${DISTRO}/${LIST_NAME}"

# NOTE(review): apt-key is deprecated; consider moving to a dedicated keyring
# referenced via "signed-by" once the upstream .list file can be adapted.
wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add -

# apt-get (not apt) has a stable CLI for scripts; -y prevents the install from
# aborting at the confirmation prompt when stdin is not a terminal.
apt-get update
apt-get install -y openmpi="${OPENMPI_VERSION}"
11 changes: 0 additions & 11 deletions images/common/scripts/install_pmix.sh

This file was deleted.

18 changes: 11 additions & 7 deletions images/controller/slurmctld.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ ARG BASE_IMAGE=ubuntu:jammy

FROM $BASE_IMAGE AS controller_slurmctld

ARG SLURM_VERSION=24.05.2
ARG CUDA_VERSION=12.2.2
ARG SLURM_VERSION=24.05.5
ARG CUDA_VERSION=12.4.1
ARG OPENMPI_VERSION=4.1.7a1

ARG DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -42,11 +43,14 @@ RUN apt-get update && \
lsof \
daemontools

# Install PMIx
COPY common/scripts/install_pmix.sh /opt/bin/
RUN chmod +x /opt/bin/install_pmix.sh && \
/opt/bin/install_pmix.sh && \
rm /opt/bin/install_pmix.sh
# Install OpenMPI
COPY common/scripts/install_openmpi.sh /opt/bin/
RUN chmod +x /opt/bin/install_openmpi.sh && \
/opt/bin/install_openmpi.sh && \
rm /opt/bin/install_openmpi.sh

ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib
ENV PATH=$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin

# TODO: Install only necessary packages
# Download and install Slurm packages
Expand Down
4 changes: 2 additions & 2 deletions images/exporter/exporter.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
# Second stage: Build image for the prometheus-slurm-exporter
FROM $BASE_IMAGE AS exporter

ARG SLURM_VERSION=24.05.2
ARG CUDA_VERSION=12.2.2
ARG SLURM_VERSION=24.05.5
ARG CUDA_VERSION=12.4.1

# TODO: Install only those dependencies that are required for running slurm exporter
# Install dependencies
Expand Down
29 changes: 13 additions & 16 deletions images/jail/jail.dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# BASE_IMAGE defined here for second multistage build
ARG BASE_IMAGE=ghcr.io/asteny/cuda_base:12.2.2
ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# First stage: Build the gpubench application
FROM golang:1.22 AS gpubench_builder
Expand All @@ -23,12 +23,13 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
#######################################################################################################################
# Second stage: Build jail image

ARG BASE_IMAGE=ghcr.io/asteny/cuda_base:12.2.2
ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

FROM $BASE_IMAGE AS jail

ARG SLURM_VERSION=24.05.2
ARG CUDA_VERSION=12.2.2
ARG SLURM_VERSION=24.05.5
ARG CUDA_VERSION=12.4.1
ARG OPENMPI_VERSION=4.1.7a1

ARG DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -108,11 +109,14 @@ RUN chown 0:0 /etc/enroot/enroot.conf && chmod 644 /etc/enroot/enroot.conf
# Create directory for enroot runtime data that will be mounted from the host
RUN mkdir -p -m 777 /usr/share/enroot/enroot-data

# Install PMIx
COPY common/scripts/install_pmix.sh /opt/bin/
RUN chmod +x /opt/bin/install_pmix.sh && \
/opt/bin/install_pmix.sh && \
rm /opt/bin/install_pmix.sh
# Install OpenMPI
COPY common/scripts/install_openmpi.sh /opt/bin/
RUN chmod +x /opt/bin/install_openmpi.sh && \
/opt/bin/install_openmpi.sh && \
rm /opt/bin/install_openmpi.sh

ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib
ENV PATH=$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin

# TODO: Install only necessary packages
# Download and install Slurm packages
Expand Down Expand Up @@ -146,13 +150,6 @@ RUN chmod +x /opt/bin/install_nvtop.sh && \
/opt/bin/install_nvtop.sh && \
rm /opt/bin/install_nvtop.sh

# Download and install NCCL packages
RUN wget -P /tmp https://github.com/nebius/slurm-deb-packages/releases/download/$CUDA_VERSION-$(grep 'VERSION_CODENAME' /etc/os-release | cut -d= -f2)-slurm$SLURM_VERSION/libnccl2_2.22.3-1+cuda12.2_amd64.deb && \
wget -P /tmp https://github.com/nebius/slurm-deb-packages/releases/download/$CUDA_VERSION-$(grep 'VERSION_CODENAME' /etc/os-release | cut -d= -f2)-slurm$SLURM_VERSION/libnccl-dev_2.22.3-1+cuda12.2_amd64.deb && \
dpkg -i /tmp/libnccl2_2.22.3-1+cuda12.2_amd64.deb && \
dpkg -i /tmp/libnccl-dev_2.22.3-1+cuda12.2_amd64.deb && \
rm -rf /tmp/*.deb

# Download NCCL tests executables
RUN wget -P /tmp https://github.com/nebius/slurm-deb-packages/releases/download/$CUDA_VERSION-$(grep 'VERSION_CODENAME' /etc/os-release | cut -d= -f2)-slurm$SLURM_VERSION/nccl-tests-perf.tar.gz && \
tar -xvzf /tmp/nccl-tests-perf.tar.gz -C /usr/bin && \
Expand Down
4 changes: 2 additions & 2 deletions images/nccl_benchmark/nccl_benchmark.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ ARG BASE_IMAGE=ubuntu:jammy

FROM $BASE_IMAGE AS nccl_benchmark

ARG SLURM_VERSION=24.05.2
ARG CUDA_VERSION=12.2.2
ARG SLURM_VERSION=24.05.5
ARG CUDA_VERSION=12.4.1

ARG DEBIAN_FRONTEND=noninteractive

Expand Down
4 changes: 2 additions & 2 deletions images/restd/slurmrestd.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ ARG BASE_IMAGE=ubuntu:jammy

FROM $BASE_IMAGE AS slurmrestd

ARG SLURM_VERSION=24.05.2
ARG CUDA_VERSION=12.2.2
ARG SLURM_VERSION=24.05.5
ARG CUDA_VERSION=12.4.1

ARG DEBIAN_FRONTEND=noninteractive

Expand Down
20 changes: 12 additions & 8 deletions images/worker/slurmd.dockerfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
ARG BASE_IMAGE=ghcr.io/asteny/cuda_base:12.2.2
ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

FROM $BASE_IMAGE AS worker_slurmd

ARG SLURM_VERSION=24.05.2
ARG CUDA_VERSION=12.2.2
ARG SLURM_VERSION=24.05.5
ARG CUDA_VERSION=12.4.1
ARG OPENMPI_VERSION=4.1.7a1

ARG DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -51,11 +52,14 @@ RUN apt-get update && \
supervisor \
openssh-server

# Install PMIx
COPY common/scripts/install_pmix.sh /opt/bin/
RUN chmod +x /opt/bin/install_pmix.sh && \
/opt/bin/install_pmix.sh && \
rm /opt/bin/install_pmix.sh
# Install OpenMPI
COPY common/scripts/install_openmpi.sh /opt/bin/
RUN chmod +x /opt/bin/install_openmpi.sh && \
/opt/bin/install_openmpi.sh && \
rm /opt/bin/install_openmpi.sh

ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib
ENV PATH=$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin

# TODO: Install only necessary packages
# Download and install Slurm packages
Expand Down

0 comments on commit 4b9bb16

Please sign in to comment.