Skip to content

Commit

Permalink
feat(ci): add package generation
Browse files Browse the repository at this point in the history
stack-info: PR: aws#592, branch: aws-nslick/stack/33
  • Loading branch information
aws-nslick committed Sep 22, 2024
1 parent ba9a401 commit 3f8ec64
Show file tree
Hide file tree
Showing 24 changed files with 1,047 additions and 4 deletions.
Empty file added .docker/cfg/use-cluster.hcl
Empty file.
Empty file added .docker/cfg/use-github.hcl
Empty file.
22 changes: 22 additions & 0 deletions .docker/cfg/use-local.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#
# What is this?
# This overrides defaults for caching, targets, tags, platforms, etc. This
# file is expected to be symlinked into the root of the tree to select the
# builder used by `docker buildx bake'.
#
# This file configures local builds.
#
# See https://docs.docker.com/build/bake/reference/#file-format

# VERSION defaults to "master" when nothing overrides it.
variable "VERSION" {
  default = "master"
}

# Local build of the "efainstaller" target for amd64 and arm64.
# The output type is "cacheonly", so no image or tarball is exported.
# NOTE(review): confirm that .docker/containers/Dockerfile.efa exists; it is
# not among the Dockerfiles visible alongside this file.
target "efainstaller" {
  context    = "."
  dockerfile = ".docker/containers/Dockerfile.efa"
  platforms  = ["linux/amd64", "linux/arm64"]
  output     = ["type=cacheonly"]
}
49 changes: 49 additions & 0 deletions .docker/containers/Dockerfile.cache_efa
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "extractor": small helper image with the `tar` package installed; the
# next stage relies on its --wildcards option.
FROM alpine:latest AS extractor
RUN apk add --no-cache tar

# Stage "extracted": unpack only the needed members of the EFA installer
# tarball (top-level *.txt and *.sh files, plus the directory matching
# INSTALLER_PREFIX and the build machine architecture), then drop the tarball.
#   INSTALLER_PREFIX  path component inside the tarball selecting what to keep.
# The tarball itself is taken from the "efa_installer_tarball" context.
FROM extractor AS extracted
ARG INSTALLER_PREFIX
ENV INSTALLER_PREFIX=${INSTALLER_PREFIX}
COPY --from=efa_installer_tarball aws-efa-installer*.tar.gz .
RUN tar xvf *.tar.gz \
        --wildcards "aws-efa-installer/*.txt" \
        --wildcards "aws-efa-installer/*.sh" \
        --wildcards "aws-efa-installer/**/${INSTALLER_PREFIX}/$(uname -m)" && \
    rm -rf *.tar.gz

# Stage "installed": run the EFA installer on top of "distro_image".
#   ENABLE_EFA_INSTALLER_DEBUG_INFO  1 passes -d to the installer.
#   ENABLE_MPI4                      1 passes "--mpi openmpi4".
#   ENABLE_MPI5                      1 passes "--mpi openmpi5".
# NOTE(review): when both MPI switches are 1 the installer receives two
# separate --mpi options; confirm it accepts that form.
FROM distro_image AS installed
ARG ENABLE_EFA_INSTALLER_DEBUG_INFO=0
ARG ENABLE_MPI4=0
ARG ENABLE_MPI5=0

ENV ENABLE_EFA_INSTALLER_DEBUG_INFO=${ENABLE_EFA_INSTALLER_DEBUG_INFO}
ENV ENABLE_MPI4=${ENABLE_MPI4}
ENV ENABLE_MPI5=${ENABLE_MPI5}

RUN mkdir /aws-efa-installer
COPY --from=extracted /aws-efa-installer /aws-efa-installer

# XXX: the EFA installer script should refresh the package caches itself.
# XXX: the EFA installer depends on util-linux, which many containers don't have.
#
# The package indexes are refreshed BEFORE the util-linux fallback install:
# container images ship with empty indexes, so installing first cannot resolve
# the package. The fallback stays best-effort (it never fails the build), and
# every step is guarded by `command -v` so one recipe serves apt, yum, dnf and
# zypper based images.
RUN ((command -v apt-get && apt-get update -y) || /bin/true ) && \
    ((command -v yum && yum update -y ) || /bin/true ) && \
    ((command -v dnf && dnf -y update ) || /bin/true ) && \
    ( command -v getopt || \
      apt-get install -y util-linux || \
      dnf -y install util-linux || \
      yum install -y util-linux || /bin/true ) && \
    cd /aws-efa-installer && \
    ./efa_installer.sh -y -n -l -k -g \
        $(test "$ENABLE_EFA_INSTALLER_DEBUG_INFO" -eq "1" && echo "-d") \
        $(test "$ENABLE_MPI4" -eq "1" && echo "--mpi openmpi4") \
        $(test "$ENABLE_MPI5" -eq "1" && echo "--mpi openmpi5") && \
    cd && rm -rf /aws-efa-installer && \
    ((command -v apt-get && apt-get purge -y && apt-get clean -y) || /bin/true ) && \
    ((command -v dnf && dnf clean all -y) || /bin/true ) && \
    ((command -v yum && yum clean all -y) || /bin/true ) && \
    ((command -v zypper && zypper clean ) || /bin/true )
44 changes: 44 additions & 0 deletions .docker/containers/Dockerfile.dnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Global build arguments. They are usable in FROM lines and are redeclared
# inside each stage that consumes them.
ARG FAMILY=fedora
ARG VERSION=rawhide
ARG VARIANT=cuda
ARG CUDA_DISTRO
ARG AWS_BUILD
ARG ENABLE_POWERTOOLS

# Stage "builder": install the RPM build tooling and EFA on the target distro.
#   CUDA_DISTRO        distro component of the NVIDIA CUDA repository URL.
#   ENABLE_POWERTOOLS  "1" enables the Rocky PowerTools repository.
FROM ${FAMILY}:${VERSION} AS builder
ARG CUDA_DISTRO
ARG ENABLE_POWERTOOLS
ENV CUDA_DISTRO=${CUDA_DISTRO}
ENV ENABLE_POWERTOOLS=${ENABLE_POWERTOOLS}
# Overlay the "efainstaller" context onto /; the RUN below expects it to
# provide /aws-efa-installer.
COPY --from=efainstaller / /
# Install EFA-installer deps, run the installer, then add the NVIDIA repo for
# CUDA builds and bring the system packages up to date.
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    bash -c "cd /aws-efa-installer && dnf install -y gcc rpmdevtools rpmlint dnf-plugins-core util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \
    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \
    ( test "${ENABLE_POWERTOOLS}" = "1" && sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo || /bin/true ) && \
    dnf -y update && dnf -y upgrade
RUN rpmdev-setuptree

# Stage "environment": rebuild the source RPM taken from the "srpm" context.
#   VARIANT          written to ~/.rpmmacros as "%with_<VARIANT> 1".
#   AWS_BUILD        written to ~/.rpmmacros as "%with_platform_aws <value>".
#   TOOLKIT_VERSION  version suffix of the cuda-cudart-devel package to
#                    install (defaults to 12-6).
FROM builder AS environment
ARG VARIANT
ARG AWS_BUILD
ARG TOOLKIT_VERSION=12-6
ENV VARIANT=${VARIANT}
ENV AWS_BUILD=${AWS_BUILD}
COPY --from=srpm . .
RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros
RUN echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    dnf -y install cuda-cudart-devel-${TOOLKIT_VERSION} && dnf -y builddep *.src.rpm && rpmbuild --rebuild *.src.rpm

# Final stage: export only the built RPMs.
FROM scratch
COPY --from=environment /root/rpmbuild/RPMS/**/* /
65 changes: 65 additions & 0 deletions .docker/containers/Dockerfile.dpkg
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Global build arguments. An ARG declared before the first FROM is only
# visible in FROM lines, so each stage redeclares the ones it needs.
ARG FAMILY=ubuntu
ARG VERSION=latest
ARG CUDA_DISTRO
ARG DEBIAN_FRONTEND=noninteractive
ARG AWS_BUILD

# Stage "build": install EFA and the toolchain, then configure, build and
# install aws-ofi-nccl from the dist tarball in the "makedist" context.
#   CUDA_DISTRO  distro component of the NVIDIA CUDA repository URL.
#   AWS_BUILD    0 builds with --disable-platform-aws; any other integer
#                builds with --enable-platform-aws and a "-aws" prefix suffix.
#   ACCELERATOR  "cuda" configures --with-cuda=/usr/local/cuda; any other
#                value, including unset, configures --enable-neuron=yes.
FROM ${FAMILY}:${VERSION} AS build
# Redeclared so the global default reaches apt-get/dpkg in this stage's RUN
# steps without being baked into the image environment.
ARG DEBIAN_FRONTEND
ARG CUDA_DISTRO
ENV CUDA_DISTRO=${CUDA_DISTRO}
ARG AWS_BUILD=0
ENV AWS_BUILD=${AWS_BUILD}
# Must be declared here, otherwise $ACCELERATOR is always empty in the
# configure step and the CUDA branch can never be selected.
ARG ACCELERATOR
ENV ACCELERATOR=${ACCELERATOR}

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y && apt-get install wget -y

RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    dpkg -i cuda-keyring_1.1-1_all.deb

# Overlay the "efainstaller" context; the RUN below expects it to provide
# /aws-efa-installer.
COPY --from=efainstaller / .
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    bash -c "apt-get update -y && cd /aws-efa-installer && ./efa_installer.sh -n -l -k -d -y && apt-get install -y autoconf automake libtool gcc g++ git libhwloc-dev make && rm -rf /aws-efa-installer"

COPY --from=makedist / .
RUN tar xvf ./aws-ofi-nccl*.tar.gz -C .
RUN cd aws-ofi-nccl* && \
    ./configure --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") \
        --prefix=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
        --with-libfabric=/opt/amazon/efa \
        --disable-tests \
        --$(test "$AWS_BUILD" -eq 0 && echo -n "disable" || echo -n "enable")-platform-aws \
        --with-mpi=no && make -j && make install

# Stage "packager": wrap the installed tree into a .deb using fpm.
#   FAMILY / VERSION  only used to name the resulting package file.
FROM ubuntu:latest AS packager
ARG DEBIAN_FRONTEND
ARG FAMILY
ARG VERSION
ARG AWS_BUILD=0
ENV AWS_BUILD=${AWS_BUILD}
ENV FAMILY=${FAMILY}
ENV VERSION=${VERSION}
COPY --from=build /opt/amazon/ /opt/amazon/
# Drop libtool archives. find's own -name/-delete match only a literal ".la"
# suffix and succeed when there is nothing to remove.
RUN find /opt/amazon/ -type f -name '*.la' -delete
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y && apt-get install -y ruby tar squashfs-tools binutils && gem install fpm
RUN fpm \
    -s dir -t deb \
    --license Apache2.0 \
    -p /libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")-${FAMILY}-${VERSION}.deb \
    --name nccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
    /opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")/=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")

# Final stage: export only the generated package.
FROM scratch
COPY --from=packager /libnccl-net-ofi* /

11 changes: 11 additions & 0 deletions .docker/containers/Dockerfile.dpkg_add_cuda_repo
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Stage "downloader": fetch the NVIDIA CUDA keyring package for the requested
# distro and the build machine's architecture.
#   CUDA_DISTRO  distro component of the NVIDIA CUDA repository URL.
FROM alpine:latest AS downloader
ARG CUDA_DISTRO
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb

# Final stage: register the CUDA repository on top of "base_image" and install
# the cudart development package.
#   CUDA_TOOLKIT_VERSION_SUFFIX  suffix appended to "cuda-cudart-dev-"; also
#                                exported into the image environment.
FROM base_image
ARG CUDA_TOOLKIT_VERSION_SUFFIX
ENV CUDA_TOOLKIT_VERSION_SUFFIX=${CUDA_TOOLKIT_VERSION_SUFFIX}
COPY --from=downloader cuda-keyring*.deb .
# The package index is refreshed twice: once to install ca-certificates, and
# again after the keyring is installed.
RUN apt-get update -y && \
    apt-get install -y ca-certificates && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm cuda-keyring*.deb && \
    apt-get update -y && \
    apt-get install -y cuda-cudart-dev-${CUDA_TOOLKIT_VERSION_SUFFIX} && \
    apt-get purge -y && \
    apt-get clean -y
17 changes: 17 additions & 0 deletions .docker/containers/Dockerfile.install_efa
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Install EFA (with debug info, openmpi4 and openmpi5) on top of
# "distro_image", using the unpacked installer from the
# "efa_installer_contents" context.
FROM distro_image
RUN mkdir /aws-efa-installer
COPY --from=efa_installer_contents /aws-efa-installer /aws-efa-installer

# XXX: EFA installer doesn't refresh the package caches if they're unpopulated,
# as they always are in container images.
#
# XXX: EFA installer depends on util-linux, which many containers don't have.
#
# The package indexes are refreshed first so the util-linux install can resolve
# the package. If getopt is missing and no package manager can install
# util-linux, the build fails. The trailing cleanup steps are best-effort and
# are chained with && so the whole command is valid shell.
RUN (command -v apt-get && apt-get update -y || /bin/true ) && \
    (! command -v yum || yum update -y ) && \
    (command -v getopt || apt-get install -y util-linux 2>/dev/null || \
     dnf -y install util-linux 2>/dev/null || yum -y install util-linux 2>/dev/null) && \
    cd /aws-efa-installer && \
    ./efa_installer.sh -d -y -n -l -k -g --mpi openmpi4,openmpi5 && \
    cd && rm -rf /aws-efa-installer && \
    (command -v apt-get && apt-get purge -y && apt-get clean -y || /bin/true ) && \
    (command -v dnf && dnf clean all -y || /bin/true ) && \
    (command -v yum && yum clean all -y || /bin/true )
23 changes: 23 additions & 0 deletions .docker/containers/Dockerfile.makedist
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "buildenv": autotools and hwloc headers on top of "base_image".
FROM base_image AS buildenv
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y && apt-get install -y automake autoconf libtool libhwloc-dev

# Stage "distbuilder": produce the release tarball with `make dist`.
#   ACCELERATOR  "cuda" configures --with-cuda=/usr/local/cuda; any other
#                value, including unset, configures --enable-neuron=yes.
FROM buildenv AS distbuilder
ARG ACCELERATOR
ENV ACCELERATOR=${ACCELERATOR}
# COPY sources are resolved relative to the build context root and cannot reach
# above it, so the context itself is copied rather than "../".
COPY . /proj
WORKDIR /proj
RUN autoreconf -ivf && \
    ./configure --with-libfabric=/opt/amazon/efa \
        --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") && \
    make -j dist

# Final stage: export only the dist tarball.
FROM scratch
COPY --from=distbuilder /proj/aws-ofi-nccl*.tar.gz /
18 changes: 18 additions & 0 deletions .docker/containers/Dockerfile.srpm
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "packitimg": Fedora image with packit and mock installed.
FROM fedora:rawhide AS packitimg
RUN dnf -y install packit mock

# Stage "srpm": overlay the "src" and "makedist" contexts into /proj and run
# `packit srpm` there. WORKDIR creates /proj when it does not exist.
FROM packitimg AS srpm
WORKDIR /proj
COPY --from=src . .
COPY --from=makedist . .
RUN packit srpm

# Final stage: export only the generated source RPM(s).
FROM scratch
COPY --from=srpm /proj/*.src.rpm /
42 changes: 42 additions & 0 deletions .docker/containers/Dockerfile.yum
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Global build arguments. They are usable in FROM lines and are redeclared
# inside each stage that consumes them.
ARG FAMILY=amazonlinux
ARG VERSION=2
ARG VARIANT=cuda
ARG CUDA_DISTRO
ARG AWS_BUILD

# Stage "builder": install the RPM build tooling and EFA on the target distro.
#   CUDA_DISTRO  distro component of the NVIDIA CUDA repository URL.
FROM ${FAMILY}:${VERSION} AS builder
ARG CUDA_DISTRO
ENV CUDA_DISTRO=${CUDA_DISTRO}
# Overlay the "efainstaller" context onto /; the RUN below expects it to
# provide /aws-efa-installer.
COPY --from=efainstaller / /
# Install EFA-installer deps, run the installer, then add the NVIDIA repo for
# CUDA builds and update the system packages.
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    bash -c "cd /aws-efa-installer && yum install -y gcc rpmdevtools rpmlint yum-utils util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \
    yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \
    yum update -y
RUN rpmdev-setuptree

# Stage "environment": rebuild the source RPM taken from the "srpm" context.
#   VARIANT          written to ~/.rpmmacros as "%with_<VARIANT> 1".
#   AWS_BUILD        written to ~/.rpmmacros as "%with_platform_aws <value>".
#   TOOLKIT_VERSION  version suffix of the cuda-cudart-devel package; also
#                    written as "%_cuda_toolkit_version <value>".
FROM builder AS environment
ARG VARIANT
ARG AWS_BUILD
ARG TOOLKIT_VERSION=12-6
ENV VARIANT=${VARIANT}
ENV AWS_BUILD=${AWS_BUILD}
ENV TOOLKIT_VERSION=${TOOLKIT_VERSION}
COPY --from=srpm . .
# Append the three build macros in the same order as before.
RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros && \
    echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros && \
    echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}" >> ~/.rpmmacros
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    yum install -y cuda-cudart-devel-${TOOLKIT_VERSION} && \
    yum-builddep -y *.src.rpm && \
    rpmbuild --rebuild *.src.rpm

# Final stage: export only the built RPMs.
FROM scratch
COPY --from=environment /root/rpmbuild/RPMS/**/* /
32 changes: 32 additions & 0 deletions .docker/eks/cluster.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# eksctl cluster definition for the "ect-build" cluster in us-west-1.
# NOTE(review): the nesting below was reconstructed from flattened text using
# the eksctl ClusterConfig layout; confirm it against the committed file.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ect-build
  region: us-west-1
  version: "1.30"
  tags:
    karpenter.sh/discovery: ect-build
karpenter:
  version: '1.0.2'
  withSpotInterruptionQueue: true
  createServiceAccount: true
iam:
  withOIDC: true
availabilityZones:
  - us-west-1b
  - us-west-1c
# One managed node group labelled "role: management", sized 1 to 2 nodes.
managedNodeGroups:
  - name: mgmt-ng-1
    amiFamily: Bottlerocket
    desiredCapacity: 1
    minSize: 1
    maxSize: 2
    labels: { role: management }
    tags:
      nodegroup-role: management
    instanceSelector:
      cpuArchitecture: arm64
      vCPUs: 8
addons:
  - name: eks-pod-identity-agent
Loading

0 comments on commit 3f8ec64

Please sign in to comment.