forked from aws/aws-ofi-nccl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
stack-info: PR: aws#592, branch: aws-nslick/stack/33
- Loading branch information
1 parent
ba9a401
commit 3f8ec64
Showing
24 changed files
with
1,047 additions
and
4 deletions.
There are no files selected for viewing
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# | ||
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
# What is this? | ||
# This overrides defaults for caching, targets, tags, platforms, etc. This | ||
# file is expected to be symlinked into the root of the tree to select the | ||
# builder used by `docker buildx bake'. | ||
# | ||
# This file configures local builds. | ||
# | ||
# See https://docs.docker.com/build/bake/reference/#file-format | ||
|
||
# Bake variable VERSION; evaluates to "master" unless overridden. | ||
variable "VERSION" { default = "master" } | ||
|
||
# Target "efainstaller": builds .docker/containers/Dockerfile.efa with the | ||
# current directory as context, for both linux/amd64 and linux/arm64. | ||
target "efainstaller" { | ||
platforms = [ "linux/amd64", "linux/arm64" ] | ||
context = "." | ||
dockerfile = ".docker/containers/Dockerfile.efa" | ||
# The cacheonly exporter keeps the result in the build cache only; no | ||
# image or files are exported. | ||
output = ["type=cacheonly"] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# | ||
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
|
||
# Stage "extractor": Alpine with the `tar` package added (presumably for the | ||
# --wildcards option used below -- TODO confirm busybox tar lacks it). | ||
FROM alpine:latest AS extractor | ||
RUN apk add --no-cache tar | ||
|
||
# Stage "extracted": unpack only the needed parts of the EFA installer tarball. | ||
FROM extractor AS extracted | ||
# INSTALLER_PREFIX names the directory inside the tarball to extract; it has | ||
# no default, so it must be supplied as a build argument. | ||
ARG INSTALLER_PREFIX | ||
ENV INSTALLER_PREFIX=${INSTALLER_PREFIX} | ||
# "efa_installer_tarball" is not a stage of this file; presumably a named | ||
# build context provided by the bake configuration -- TODO confirm. | ||
COPY --from=efa_installer_tarball aws-efa-installer*.tar.gz . | ||
# Extract the top-level *.txt and *.sh files plus the | ||
# ${INSTALLER_PREFIX}/<output of uname -m> subtree, then delete the tarball | ||
# in the same layer. | ||
RUN tar xvf *.tar.gz \ | ||
--wildcards "aws-efa-installer/*.txt" \ | ||
--wildcards "aws-efa-installer/*.sh" \ | ||
--wildcards "aws-efa-installer/**/${INSTALLER_PREFIX}/$(uname -m)" && \ | ||
rm -rf *.tar.gz | ||
|
||
# Stage "installed": run the EFA installer on top of the distro image. | ||
# "distro_image" is not defined in this file; presumably a named build | ||
# context provided by the bake configuration -- TODO confirm. | ||
FROM distro_image AS installed | ||
# Feature toggles; each is compared against 1 in the RUN step below. | ||
ARG ENABLE_EFA_INSTALLER_DEBUG_INFO=0 | ||
ARG ENABLE_MPI4=0 | ||
ARG ENABLE_MPI5=0 | ||
|
||
ENV ENABLE_EFA_INSTALLER_DEBUG_INFO=${ENABLE_EFA_INSTALLER_DEBUG_INFO} | ||
ENV ENABLE_MPI4=${ENABLE_MPI4} | ||
ENV ENABLE_MPI5=${ENABLE_MPI5} | ||
|
||
RUN mkdir /aws-efa-installer | ||
COPY --from=extracted /aws-efa-installer /aws-efa-installer | ||
|
||
# XXX: the EFA installer script should refresh the package caches itself. | ||
# XXX: the EFA installer depends on util-linux, which many containers don't have. | ||
# | ||
# Order matters: package metadata is refreshed BEFORE the best-effort | ||
# util-linux install, because a package manager with empty package lists | ||
# cannot resolve util-linux. Every package-manager probe is best-effort | ||
# (|| /bin/true) so the same step works on apt-, yum- and dnf-based images. | ||
# The installer flags -d / --mpi openmpi4 / --mpi openmpi5 are appended only | ||
# when the matching ENABLE_* variable equals 1. Package caches are cleaned in | ||
# the same layer so they do not persist in the image. | ||
RUN ((command -v apt-get && apt-get update -y) || /bin/true ) && \ | ||
((command -v yum && yum update -y ) || /bin/true ) && \ | ||
((command -v dnf && dnf -y update ) || /bin/true ) && \ | ||
( ! command -v getopt && ( apt-get install -y util-linux || \ | ||
dnf -y install util-linux || \ | ||
yum install -y util-linux ) || /bin/true) && \ | ||
cd /aws-efa-installer && \ | ||
./efa_installer.sh -y -n -l -k -g \ | ||
$(test "$ENABLE_EFA_INSTALLER_DEBUG_INFO" -eq "1" && echo "-d") \ | ||
$(test "$ENABLE_MPI4" -eq "1" && echo "--mpi openmpi4") \ | ||
$(test "$ENABLE_MPI5" -eq "1" && echo "--mpi openmpi5") && \ | ||
cd && rm -rf /aws-efa-installer && \ | ||
((command -v apt-get && apt-get purge -y && apt-get clean -y) || /bin/true ) && \ | ||
((command -v dnf && dnf clean all -y) || /bin/true ) && \ | ||
((command -v yum && yum clean all -y) || /bin/true ) && \ | ||
((command -v zypper && zypper clean ) || /bin/true ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# | ||
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
|
||
# Global build arguments. FAMILY and VERSION select the base image in the | ||
# FROM line below; the others are re-declared inside the stages that use them. | ||
ARG FAMILY=fedora | ||
ARG VERSION=rawhide | ||
ARG VARIANT=cuda | ||
ARG CUDA_DISTRO | ||
ARG AWS_BUILD | ||
ARG ENABLE_POWERTOOLS | ||
|
||
# Install EFA-installer deps. | ||
FROM ${FAMILY}:${VERSION} AS builder | ||
ARG CUDA_DISTRO | ||
ARG ENABLE_POWERTOOLS | ||
ENV CUDA_DISTRO=${CUDA_DISTRO} | ||
ENV ENABLE_POWERTOOLS=${ENABLE_POWERTOOLS} | ||
# Add NVIDIA repo for CUDA builds. | ||
# The COPY overlays the whole root of "efainstaller" onto this image; that | ||
# name is not a stage of this file (presumably a named build context -- TODO | ||
# confirm). The RUN below then: | ||
#   1. installs the build tooling and runs the EFA installer from | ||
#      /aws-efa-installer, removing that directory afterwards; | ||
#   2. adds the NVIDIA CUDA repo for ${CUDA_DISTRO} and the machine arch; | ||
#   3. when ENABLE_POWERTOOLS is "1", flips enabled=0 to enabled=1 in | ||
#      /etc/yum.repos.d/Rocky-PowerTools.repo (best-effort); | ||
#   4. updates and upgrades all packages. | ||
COPY --from=efainstaller / / | ||
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \ | ||
--mount=type=cache,target=/var/cache/dnf,sharing=locked \ | ||
bash -c "cd /aws-efa-installer && dnf install -y gcc rpmdevtools rpmlint dnf-plugins-core util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \ | ||
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \ | ||
( test "${ENABLE_POWERTOOLS}" = "1" && sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo || /bin/true ) && \ | ||
dnf -y update && dnf -y upgrade | ||
RUN rpmdev-setuptree | ||
|
||
# Stage "environment": rebuild the source RPM into binary RPMs. | ||
FROM builder AS environment | ||
ARG VARIANT | ||
ARG AWS_BUILD | ||
ENV VARIANT=${VARIANT} | ||
ENV AWS_BUILD=${AWS_BUILD} | ||
# "srpm" is not a stage of this file; presumably a named build context | ||
# holding the *.src.rpm -- TODO confirm. | ||
COPY --from=srpm . . | ||
# NOTE(review): this search has no effect on the build output; it looks | ||
# like a leftover diagnostic -- confirm before removing. | ||
RUN yum search hwloc | ||
# Append the rpm macros %with_<VARIANT> and %with_platform_aws to | ||
# ~/.rpmmacros for the rpmbuild step below. | ||
RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros | ||
RUN echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros | ||
# Install the CUDA runtime headers (version 12-6 is hard-coded here), then | ||
# the source RPM's build dependencies, then rebuild it. | ||
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \ | ||
--mount=type=cache,target=/var/cache/dnf,sharing=locked \ | ||
dnf -y install cuda-cudart-devel-12-6 && dnf -y builddep *.src.rpm && rpmbuild --rebuild *.src.rpm | ||
|
||
# Final image contains only the files matched under /root/rpmbuild/RPMS. | ||
FROM scratch | ||
COPY --from=environment /root/rpmbuild/RPMS/**/* / |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# | ||
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
|
||
# Global build arguments. FAMILY and VERSION select the base image of the | ||
# "build" stage; the others only take effect in stages that re-declare them. | ||
ARG FAMILY=ubuntu | ||
ARG VERSION=latest | ||
ARG CUDA_DISTRO | ||
ARG DEBIAN_FRONTEND=noninteractive | ||
ARG AWS_BUILD | ||
|
||
# Stage "build": install the EFA stack and build the plugin from the dist | ||
# tarball into /opt/amazon. | ||
FROM ${FAMILY}:${VERSION} AS build | ||
ARG CUDA_DISTRO | ||
ENV CUDA_DISTRO=${CUDA_DISTRO} | ||
ARG AWS_BUILD=0 | ||
ENV AWS_BUILD=${AWS_BUILD} | ||
# ACCELERATOR is read by ./configure below. It must be declared in this stage | ||
# for --build-arg ACCELERATOR=... to reach the RUN step; when unset the | ||
# configure line falls back to --enable-neuron=yes, as before. | ||
ARG ACCELERATOR | ||
# Re-declare without a value to inherit the global default, so apt-get runs | ||
# non-interactively at build time without baking the setting into the image. | ||
ARG DEBIAN_FRONTEND | ||
|
||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ | ||
--mount=type=cache,target=/var/lib/apt,sharing=locked \ | ||
apt-get update -y && apt-get install wget -y | ||
|
||
# Fetch the NVIDIA CUDA repository keyring for ${CUDA_DISTRO} and this arch. | ||
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb | ||
|
||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ | ||
--mount=type=cache,target=/var/lib/apt,sharing=locked \ | ||
dpkg -i cuda-keyring_1.1-1_all.deb | ||
|
||
# "efainstaller" is not a stage of this file; presumably a named build | ||
# context containing /aws-efa-installer -- TODO confirm. | ||
COPY --from=efainstaller / . | ||
# Run the EFA installer with the same flags the RPM Dockerfiles use (no | ||
# positional arguments), then install the autotools build dependencies. | ||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ | ||
--mount=type=cache,target=/var/lib/apt,sharing=locked \ | ||
bash -c "apt-get update -y && cd /aws-efa-installer && ./efa_installer.sh -n -l -k -d -y && apt-get install -y autoconf automake libtool gcc g++ git libhwloc-dev make && rm -rf /aws-efa-installer" | ||
|
||
# "makedist" is not a stage of this file; presumably a named build context | ||
# providing aws-ofi-nccl*.tar.gz -- TODO confirm. | ||
COPY --from=makedist / . | ||
RUN tar xvf ./aws-ofi-nccl*.tar.gz -C . | ||
# Configure flags derived from build arguments: | ||
#   ACCELERATOR = "cuda" -> --with-cuda=/usr/local/cuda, else --enable-neuron=yes | ||
#   AWS_BUILD != 0       -> "-aws" prefix suffix and --enable-platform-aws | ||
RUN cd aws-ofi-nccl* && \ | ||
./configure --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") \ | ||
--prefix=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \ | ||
--with-libfabric=/opt/amazon/efa \ | ||
--disable-tests \ | ||
--$(test "$AWS_BUILD" -eq 0 && echo -n "disable" || echo -n "enable")-platform-aws \ | ||
--with-mpi=no && make -j && make install | ||
|
||
# Stage "packager": wrap the installed plugin tree into a .deb using fpm. | ||
FROM ubuntu:latest AS packager | ||
ARG FAMILY | ||
ARG VERSION | ||
ARG AWS_BUILD=0 | ||
# Re-declare without a value to inherit the global default, so apt-get runs | ||
# non-interactively at build time without baking the setting into the image. | ||
ARG DEBIAN_FRONTEND | ||
ENV AWS_BUILD=${AWS_BUILD} | ||
ENV FAMILY=${FAMILY} | ||
ENV VERSION=${VERSION} | ||
COPY --from=build /opt/amazon/ /opt/amazon/ | ||
# Remove libtool archives. Matching by quoted name avoids the unquoted-regex | ||
# pitfall (the shell strips the backslash from \.la$, leaving a pattern that | ||
# matches any name ending in <any char>la), and -delete succeeds when | ||
# nothing matches, whereas `xargs rm` with no input fails the build. | ||
RUN find /opt/amazon/ -name '*.la' -delete | ||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ | ||
--mount=type=cache,target=/var/lib/apt,sharing=locked \ | ||
apt-get update -y && apt-get install -y ruby tar squashfs-tools binutils && gem install fpm | ||
# Package name, output file name and packaged directory all get an "-aws" | ||
# suffix when AWS_BUILD is non-zero. | ||
RUN fpm \ | ||
-s dir -t deb \ | ||
--license Apache2.0 \ | ||
-p /libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")-${FAMILY}-${VERSION}.deb \ | ||
--name nccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \ | ||
/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")/=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") | ||
|
||
# Final image contains only the generated package file(s). | ||
FROM scratch | ||
COPY --from=packager /libnccl-net-ofi* / | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Stage "downloader": fetch the NVIDIA CUDA repository keyring package for | ||
# ${CUDA_DISTRO} and the machine architecture reported by uname -m. | ||
FROM alpine:latest AS downloader | ||
ARG CUDA_DISTRO | ||
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb | ||
|
||
# "base_image" is not defined in this file; presumably a named build context | ||
# -- TODO confirm. | ||
FROM base_image | ||
# Version suffix appended to the cuda-cudart-dev package name below; it has | ||
# no default, so it must be supplied as a build argument. | ||
ARG CUDA_TOOLKIT_VERSION_SUFFIX | ||
ENV CUDA_TOOLKIT_VERSION_SUFFIX=${CUDA_TOOLKIT_VERSION_SUFFIX} | ||
COPY --from=downloader cuda-keyring*.deb . | ||
# Install ca-certificates and the keyring, refresh the package lists so the | ||
# CUDA repo is visible, install the CUDA runtime dev package, and clean the | ||
# apt cache in the same layer. | ||
RUN apt-get update -y && apt-get install -y ca-certificates && dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring*.deb && \ | ||
apt-get update -y && apt-get install -y cuda-cudart-dev-${CUDA_TOOLKIT_VERSION_SUFFIX} && \ | ||
apt-get purge -y && apt-get clean -y |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Install the EFA stack (with debug info, openmpi4 and openmpi5) on top of | ||
# the distro image. "distro_image" and "efa_installer_contents" are not | ||
# defined in this file; presumably named build contexts -- TODO confirm. | ||
FROM distro_image | ||
RUN mkdir /aws-efa-installer | ||
COPY --from=efa_installer_contents /aws-efa-installer /aws-efa-installer | ||
|
||
# XXX: EFA installer doesn't refresh the package caches if they're unpopulated, | ||
# as they always are in container images. | ||
# | ||
# XXX: EFA installer depends on util-linux, which many containers don't have. | ||
# | ||
# Package metadata is refreshed before util-linux is installed, since a | ||
# package manager with empty package lists cannot resolve it. Every group in | ||
# the chain is joined with &&: two adjacent ( ... ) groups with nothing | ||
# between them are a shell syntax error. `clean` needs an explicit target, | ||
# hence `clean all`. | ||
RUN (command -v apt-get && apt-get update -y || /bin/true ) && \ | ||
(! command -v yum || yum update -y ) && \ | ||
(command -v getopt || apt-get install -y util-linux 2>/dev/null || \ | ||
dnf -y install util-linux 2>/dev/null || yum -y install util-linux 2>/dev/null) && \ | ||
cd /aws-efa-installer && \ | ||
./efa_installer.sh -d -y -n -l -k -g --mpi openmpi4,openmpi5 && \ | ||
cd && rm -rf /aws-efa-installer && \ | ||
(command -v apt-get && apt-get purge -y && apt-get clean -y || /bin/true ) && \ | ||
(command -v dnf && dnf clean all -y || /bin/true ) && \ | ||
(command -v yum && yum clean all -y || /bin/true ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# | ||
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
|
||
# Stage "buildenv": autotools and hwloc headers on top of "base_image", which | ||
# is not defined in this file (presumably a named build context -- TODO | ||
# confirm). | ||
FROM base_image AS buildenv | ||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ | ||
--mount=type=cache,target=/var/lib/apt,sharing=locked \ | ||
apt-get update -y && apt-get install -y automake autoconf libtool libhwloc-dev | ||
|
||
# Stage "distbuilder": generate the release tarball with `make dist`. | ||
FROM buildenv AS distbuilder | ||
# ACCELERATOR = "cuda" selects --with-cuda=/usr/local/cuda; any other value | ||
# (including unset) selects --enable-neuron=yes. | ||
ARG ACCELERATOR | ||
ENV ACCELERATOR=${ACCELERATOR} | ||
# NOTE(review): COPY sources cannot reach outside the build context, so | ||
# `../` presumably resolves to the context root -- confirm. | ||
COPY ../ /proj | ||
WORKDIR /proj | ||
RUN autoreconf -ivf && \ | ||
./configure --with-libfabric=/opt/amazon/efa \ | ||
--$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") && \ | ||
make -j dist | ||
|
||
# Final image contains only the generated dist tarball(s). | ||
FROM scratch | ||
COPY --from=distbuilder /proj/aws-ofi-nccl*.tar.gz / |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# | ||
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
|
||
# Stage "packitimg": Fedora rawhide with packit and mock installed. | ||
FROM fedora:rawhide AS packitimg | ||
RUN dnf install -y packit mock | ||
|
||
# Stage "srpm": build a source RPM with `packit srpm` inside /proj. | ||
FROM packitimg AS srpm | ||
RUN mkdir /proj | ||
WORKDIR /proj | ||
# "src" and "makedist" are not stages of this file; presumably named build | ||
# contexts holding the source tree and the dist tarball -- TODO confirm. | ||
COPY --from=src . . | ||
COPY --from=makedist . . | ||
RUN packit srpm | ||
|
||
# Final image contains only the generated *.src.rpm file(s). | ||
FROM scratch | ||
COPY --from=srpm /proj/*.src.rpm / |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# | ||
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. | ||
# | ||
# See LICENSE.txt for license information | ||
# | ||
|
||
# Global build arguments. FAMILY and VERSION select the base image in the | ||
# FROM line below; the others are re-declared inside the stages that use them. | ||
ARG FAMILY=amazonlinux | ||
ARG VERSION=2 | ||
ARG VARIANT=cuda | ||
ARG CUDA_DISTRO | ||
ARG AWS_BUILD | ||
|
||
# Install EFA-installer deps. | ||
FROM ${FAMILY}:${VERSION} AS builder | ||
ARG CUDA_DISTRO | ||
ENV CUDA_DISTRO=${CUDA_DISTRO} | ||
# Add NVIDIA repo for CUDA builds. | ||
# The COPY overlays the whole root of "efainstaller" onto this image; that | ||
# name is not a stage of this file (presumably a named build context -- TODO | ||
# confirm). The RUN below installs the build tooling, runs the EFA installer | ||
# from /aws-efa-installer (removing that directory afterwards), adds the | ||
# NVIDIA CUDA repo for ${CUDA_DISTRO} and the machine arch, and updates all | ||
# packages. | ||
COPY --from=efainstaller / / | ||
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \ | ||
--mount=type=cache,target=/var/cache/dnf,sharing=locked \ | ||
bash -c "cd /aws-efa-installer && yum install -y gcc rpmdevtools rpmlint yum-utils util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \ | ||
yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \ | ||
yum update -y | ||
RUN rpmdev-setuptree | ||
|
||
# Stage "environment": rebuild the source RPM into binary RPMs. | ||
FROM builder AS environment | ||
ARG VARIANT | ||
ARG AWS_BUILD | ||
# CUDA toolkit version suffix; used both in the cuda-cudart-devel package | ||
# name and in the %_cuda_toolkit_version rpm macro. | ||
ARG TOOLKIT_VERSION=12-6 | ||
ENV VARIANT=${VARIANT} | ||
ENV AWS_BUILD=${AWS_BUILD} | ||
ENV TOOLKIT_VERSION=${TOOLKIT_VERSION} | ||
# "srpm" is not a stage of this file; presumably a named build context | ||
# holding the *.src.rpm -- TODO confirm. | ||
COPY --from=srpm . . | ||
# Append the rpm macros consumed by the rpmbuild step to ~/.rpmmacros. | ||
RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros | ||
RUN echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros | ||
RUN echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}" >> ~/.rpmmacros | ||
# Install the CUDA runtime headers, then the source RPM's build | ||
# dependencies, then rebuild it. | ||
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \ | ||
--mount=type=cache,target=/var/cache/dnf,sharing=locked \ | ||
yum install -y cuda-cudart-devel-${TOOLKIT_VERSION} && yum-builddep -y *.src.rpm && rpmbuild --rebuild *.src.rpm | ||
|
||
# Final image contains only the files matched under /root/rpmbuild/RPMS. | ||
FROM scratch | ||
COPY --from=environment /root/rpmbuild/RPMS/**/* / |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# eksctl ClusterConfig for a cluster named "ect-build" in us-west-1. | ||
# NOTE(review): indentation is absent in this rendering, so the key nesting | ||
# is not visible here (several keys, e.g. `version`, `name` and `tags`, | ||
# appear more than once). Verify the structure against the original file. | ||
--- | ||
apiVersion: eksctl.io/v1alpha5 | ||
kind: ClusterConfig | ||
metadata: | ||
name: ect-build | ||
region: us-west-1 | ||
version: "1.30" | ||
tags: | ||
karpenter.sh/discovery: ect-build | ||
# Karpenter settings. | ||
karpenter: | ||
version: '1.0.2' | ||
withSpotInterruptionQueue: true | ||
createServiceAccount: true | ||
iam: | ||
withOIDC: true | ||
availabilityZones: | ||
- us-west-1b | ||
- us-west-1c | ||
# One managed node group, "mgmt-ng-1": Bottlerocket AMI family, 1-2 nodes, | ||
# selected by instance attributes (arm64, 8 vCPUs). | ||
managedNodeGroups: | ||
- name: mgmt-ng-1 | ||
amiFamily: Bottlerocket | ||
desiredCapacity: 1 | ||
minSize: 1 | ||
maxSize: 2 | ||
labels: { role: management } | ||
tags: | ||
nodegroup-role: management | ||
instanceSelector: | ||
cpuArchitecture: arm64 | ||
vCPUs: 8 | ||
addons: | ||
- name: eks-pod-identity-agent |
Oops, something went wrong.