diff --git a/Makefile b/Makefile index a75fa4cf..1e425284 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ CHART_OPERATOR_CRDS_PATH = $(CHART_PATH)/soperator-crds CHART_CLUSTER_PATH = $(CHART_PATH)/slurm-cluster CHART_STORAGE_PATH = $(CHART_PATH)/slurm-cluster-storage -SLURM_VERSION = 24.05.2 +SLURM_VERSION = 24.05.5 UBUNTU_VERSION = jammy VERSION = $(shell cat VERSION) diff --git a/README.md b/README.md index 8b1dc88e..93725435 100644 --- a/README.md +++ b/README.md @@ -111,11 +111,10 @@ This helps cluster administrators and users monitor resource utilization, enforc implemented it yet. - **Single-partition clusters**. Slurm's ability to split clusters into several partitions isn't supported now. - **Software versions**. The list of software versions we currently support is quite short. - - Linux: Ubuntu [20.04](https://releases.ubuntu.com/focal/) and - [22.04](https://releases.ubuntu.com/jammy/). - - Slurm: versions `23.11.6` and `24.05.3`. - - CUDA: version [12.2.2](https://developer.nvidia.com/cuda-12-2-2-download-archive). - - Kubernetes: >= [1.29](https://kubernetes.io/blog/2023/08/15/kubernetes-v1-28-release/). + - Linux: Ubuntu [22.04](https://releases.ubuntu.com/jammy/). + - Slurm: version `24.05.5`. + - CUDA: version [12.4.1](https://developer.nvidia.com/cuda-12-4-1-download-archive). + - Kubernetes: >= [1.29](https://kubernetes.io/blog/2023/12/13/kubernetes-v1-29-release/). - Versions of some preinstalled software packages can't be changed. diff --git a/docs/limitations.md b/docs/limitations.md index c67ca975..d0c0d303 100644 --- a/docs/limitations.md +++ b/docs/limitations.md @@ -30,11 +30,10 @@ equipped with different GPU models, use different container images, have differe ### Software versions Our list of supported software versions is pretty short right now: -- Linux distribution: Ubuntu [20.04](https://releases.ubuntu.com/focal/) and -[22.04](https://releases.ubuntu.com/jammy/). -- Slurm: versions `23.11.6` and `24.05.3`. 
-- CUDA: version [12.2.2](https://developer.nvidia.com/cuda-12-2-2-download-archive). -- Kubernetes: >= [1.28](https://kubernetes.io/blog/2023/08/15/kubernetes-v1-28-release/). +- Linux distribution: Ubuntu [22.04](https://releases.ubuntu.com/jammy/). +- Slurm: version `24.05.5`. +- CUDA: version [12.4.1](https://developer.nvidia.com/cuda-12-4-1-download-archive). +- Kubernetes: >= [1.29](https://kubernetes.io/blog/2023/12/13/kubernetes-v1-29-release/). - Versions of some preinstalled software packages can't be changed. Other versions may also be supported, but we haven't checked it yet. It would be cool if someone from the community diff --git a/helm/slurm-cluster/values.yaml b/helm/slurm-cluster/values.yaml index 298065fb..da06a312 100644 --- a/helm/slurm-cluster/values.yaml +++ b/helm/slurm-cluster/values.yaml @@ -402,13 +402,13 @@ telemetry: {} # otelCollectorPort: 8429 images: - slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.16.1-jammy-slurm24.05.2" - slurmrestd: "cr.eu-north1.nebius.cloud/soperator/slurmrestd:1.16.1-jammy-slurm24.05.2" - slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.16.1-jammy-slurm24.05.2" - sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.16.1-jammy-slurm24.05.2" - munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.16.1-jammy-slurm24.05.2" - populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.16.1-jammy-slurm24.05.2" - ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.16.1-jammy-slurm24.05.2" - slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.16.1-jammy-slurm24.05.2" - exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.16.1-jammy-slurm24.05.2" + slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.16.1-jammy-slurm24.05.5" + slurmrestd: "cr.eu-north1.nebius.cloud/soperator/slurmrestd:1.16.1-jammy-slurm24.05.5" + slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.16.1-jammy-slurm24.05.5" + sshd: 
"cr.eu-north1.nebius.cloud/soperator/login_sshd:1.16.1-jammy-slurm24.05.5" + munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.16.1-jammy-slurm24.05.5" + populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.16.1-jammy-slurm24.05.5" + ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.16.1-jammy-slurm24.05.5" + slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.16.1-jammy-slurm24.05.5" + exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.16.1-jammy-slurm24.05.5" mariaDB: "docker-registry1.mariadb.com/library/mariadb:11.4.3" diff --git a/images/accounting/slurmdbd.dockerfile b/images/accounting/slurmdbd.dockerfile index e7e2ac61..3a0f4fc6 100644 --- a/images/accounting/slurmdbd.dockerfile +++ b/images/accounting/slurmdbd.dockerfile @@ -2,8 +2,8 @@ ARG BASE_IMAGE=ubuntu:jammy FROM $BASE_IMAGE AS controller_slurmdbd -ARG SLURM_VERSION=24.05.2 -ARG CUDA_VERSION=12.2.2 +ARG SLURM_VERSION=24.05.5 +ARG CUDA_VERSION=12.4.1 ARG DEBIAN_FRONTEND=noninteractive diff --git a/images/common/scripts/install_openmpi.sh b/images/common/scripts/install_openmpi.sh new file mode 100644 index 00000000..a340264a --- /dev/null +++ b/images/common/scripts/install_openmpi.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +OPENMPI_VERSION=4.1.7a1-1.2310055 +OFED_VERSION=23.10-2.1.3.1 +DISTRO=$(. 
/etc/os-release; echo "$ID""$VERSION_ID") +cd /etc/apt/sources.list.d || exit +wget https://linux.mellanox.com/public/repo/mlnx_ofed/$OFED_VERSION/"$DISTRO"/mellanox_mlnx_ofed.list +wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add - +apt-get update +apt-get install -y openmpi="$OPENMPI_VERSION" diff --git a/images/common/scripts/install_pmix.sh b/images/common/scripts/install_pmix.sh deleted file mode 100644 index c2613d3a..00000000 --- a/images/common/scripts/install_pmix.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -export PMIX_VERSION=5.0.3 -cd /usr/src && \ -wget https://github.com/openpmix/openpmix/releases/download/v${PMIX_VERSION}/pmix-${PMIX_VERSION}.tar.gz && \ -tar -xzvf pmix-${PMIX_VERSION}.tar.gz && \ -rm -rf pmix-${PMIX_VERSION}.tar.gz && \ -cd /usr/src/pmix-${PMIX_VERSION} && \ -./configure && \ -make -j"$(nproc)" && \ -make install diff --git a/images/controller/slurmctld.dockerfile b/images/controller/slurmctld.dockerfile index 34e424d3..631543f6 100644 --- a/images/controller/slurmctld.dockerfile +++ b/images/controller/slurmctld.dockerfile @@ -2,8 +2,9 @@ ARG BASE_IMAGE=ubuntu:jammy FROM $BASE_IMAGE AS controller_slurmctld -ARG SLURM_VERSION=24.05.2 -ARG CUDA_VERSION=12.2.2 +ARG SLURM_VERSION=24.05.5 +ARG CUDA_VERSION=12.4.1 +ARG OPENMPI_VERSION=4.1.7a1 ARG DEBIAN_FRONTEND=noninteractive @@ -42,11 +43,14 @@ RUN apt-get update && \ lsof \ daemontools -# Install PMIx -COPY common/scripts/install_pmix.sh /opt/bin/ -RUN chmod +x /opt/bin/install_pmix.sh && \ - /opt/bin/install_pmix.sh && \ - rm /opt/bin/install_pmix.sh +# Install OpenMPI +COPY common/scripts/install_openmpi.sh /opt/bin/ +RUN chmod +x /opt/bin/install_openmpi.sh && \ + /opt/bin/install_openmpi.sh && \ + rm /opt/bin/install_openmpi.sh + +ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +ENV PATH=$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin # TODO: Install only necessary packages # Download and install Slurm packages diff --git 
a/images/exporter/exporter.dockerfile b/images/exporter/exporter.dockerfile index 60201027..dc144a29 100644 --- a/images/exporter/exporter.dockerfile +++ b/images/exporter/exporter.dockerfile @@ -29,8 +29,8 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ # Second stage: Build image for the prometheus-slurm-exporter FROM $BASE_IMAGE AS exporter -ARG SLURM_VERSION=24.05.2 -ARG CUDA_VERSION=12.2.2 +ARG SLURM_VERSION=24.05.5 +ARG CUDA_VERSION=12.4.1 # TODO: Install only those dependencies that are required for running slurm exporter # Install dependencies diff --git a/images/jail/jail.dockerfile b/images/jail/jail.dockerfile index 04b21480..a46a368a 100644 --- a/images/jail/jail.dockerfile +++ b/images/jail/jail.dockerfile @@ -1,5 +1,5 @@ # BASE_IMAGE defined here for second multistage build -ARG BASE_IMAGE=ghcr.io/asteny/cuda_base:12.2.2 +ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 # First stage: Build the gpubench application FROM golang:1.22 AS gpubench_builder @@ -23,12 +23,13 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ ####################################################################################################################### # Second stage: Build jail image -ARG BASE_IMAGE=ghcr.io/asteny/cuda_base:12.2.2 +ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 FROM $BASE_IMAGE AS jail -ARG SLURM_VERSION=24.05.2 -ARG CUDA_VERSION=12.2.2 +ARG SLURM_VERSION=24.05.5 +ARG CUDA_VERSION=12.4.1 +ARG OPENMPI_VERSION=4.1.7a1 ARG DEBIAN_FRONTEND=noninteractive @@ -108,11 +109,14 @@ RUN chown 0:0 /etc/enroot/enroot.conf && chmod 644 /etc/enroot/enroot.conf # Create directory for enroot runtime data that will be mounted from the host RUN mkdir -p -m 777 /usr/share/enroot/enroot-data -# Install PMIx -COPY common/scripts/install_pmix.sh /opt/bin/ -RUN chmod +x /opt/bin/install_pmix.sh && \ - /opt/bin/install_pmix.sh && \ - rm /opt/bin/install_pmix.sh +# Install OpenMPI +COPY 
common/scripts/install_openmpi.sh /opt/bin/ +RUN chmod +x /opt/bin/install_openmpi.sh && \ + /opt/bin/install_openmpi.sh && \ + rm /opt/bin/install_openmpi.sh + +ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +ENV PATH=$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin # TODO: Install only necessary packages # Download and install Slurm packages @@ -146,13 +150,6 @@ RUN chmod +x /opt/bin/install_nvtop.sh && \ /opt/bin/install_nvtop.sh && \ rm /opt/bin/install_nvtop.sh -# Download and install NCCL packages -RUN wget -P /tmp https://github.com/nebius/slurm-deb-packages/releases/download/$CUDA_VERSION-$(grep 'VERSION_CODENAME' /etc/os-release | cut -d= -f2)-slurm$SLURM_VERSION/libnccl2_2.22.3-1+cuda12.2_amd64.deb && \ - wget -P /tmp https://github.com/nebius/slurm-deb-packages/releases/download/$CUDA_VERSION-$(grep 'VERSION_CODENAME' /etc/os-release | cut -d= -f2)-slurm$SLURM_VERSION/libnccl-dev_2.22.3-1+cuda12.2_amd64.deb && \ - dpkg -i /tmp/libnccl2_2.22.3-1+cuda12.2_amd64.deb && \ - dpkg -i /tmp/libnccl-dev_2.22.3-1+cuda12.2_amd64.deb && \ - rm -rf /tmp/*.deb - # Download NCCL tests executables RUN wget -P /tmp https://github.com/nebius/slurm-deb-packages/releases/download/$CUDA_VERSION-$(grep 'VERSION_CODENAME' /etc/os-release | cut -d= -f2)-slurm$SLURM_VERSION/nccl-tests-perf.tar.gz && \ tar -xvzf /tmp/nccl-tests-perf.tar.gz -C /usr/bin && \ diff --git a/images/nccl_benchmark/nccl_benchmark.dockerfile b/images/nccl_benchmark/nccl_benchmark.dockerfile index 73ec50df..7ef6b447 100644 --- a/images/nccl_benchmark/nccl_benchmark.dockerfile +++ b/images/nccl_benchmark/nccl_benchmark.dockerfile @@ -2,8 +2,8 @@ ARG BASE_IMAGE=ubuntu:jammy FROM $BASE_IMAGE AS nccl_benchmark -ARG SLURM_VERSION=24.05.2 -ARG CUDA_VERSION=12.2.2 +ARG SLURM_VERSION=24.05.5 +ARG CUDA_VERSION=12.4.1 ARG DEBIAN_FRONTEND=noninteractive diff --git a/images/restd/slurmrestd.dockerfile b/images/restd/slurmrestd.dockerfile index 7d86f070..8f8f492a 100644 --- 
a/images/restd/slurmrestd.dockerfile +++ b/images/restd/slurmrestd.dockerfile @@ -2,8 +2,8 @@ ARG BASE_IMAGE=ubuntu:jammy FROM $BASE_IMAGE AS slurmrestd -ARG SLURM_VERSION=24.05.2 -ARG CUDA_VERSION=12.2.2 +ARG SLURM_VERSION=24.05.5 +ARG CUDA_VERSION=12.4.1 ARG DEBIAN_FRONTEND=noninteractive diff --git a/images/worker/slurmd.dockerfile b/images/worker/slurmd.dockerfile index 00494aea..53b8aa26 100644 --- a/images/worker/slurmd.dockerfile +++ b/images/worker/slurmd.dockerfile @@ -1,9 +1,10 @@ -ARG BASE_IMAGE=ghcr.io/asteny/cuda_base:12.2.2 +ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 FROM $BASE_IMAGE AS worker_slurmd -ARG SLURM_VERSION=24.05.2 -ARG CUDA_VERSION=12.2.2 +ARG SLURM_VERSION=24.05.5 +ARG CUDA_VERSION=12.4.1 +ARG OPENMPI_VERSION=4.1.7a1 ARG DEBIAN_FRONTEND=noninteractive @@ -51,11 +52,14 @@ RUN apt-get update && \ supervisor \ openssh-server -# Install PMIx -COPY common/scripts/install_pmix.sh /opt/bin/ -RUN chmod +x /opt/bin/install_pmix.sh && \ - /opt/bin/install_pmix.sh && \ - rm /opt/bin/install_pmix.sh +# Install OpenMPI +COPY common/scripts/install_openmpi.sh /opt/bin/ +RUN chmod +x /opt/bin/install_openmpi.sh && \ + /opt/bin/install_openmpi.sh && \ + rm /opt/bin/install_openmpi.sh + +ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +ENV PATH=$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin # TODO: Install only necessary packages # Download and install Slurm packages