-
Notifications
You must be signed in to change notification settings - Fork 92
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
1,587 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
checkpoints | ||
slurm-*.out |
227 changes: 227 additions & 0 deletions
227
3.test_cases/XX.transformer-engine/0.transformer-engine.dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: MIT-0 | ||
|
||
FROM nvcr.io/nvidia/pytorch:24.04-py3 | ||
ENV DEBIAN_FRONTEND=noninteractive | ||
|
||
# The three must-be-built packages. | ||
# Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error. | ||
ENV EFA_INSTALLER_VERSION=1.30.0 | ||
ENV AWS_OFI_NCCL_VERSION=1.8.1-aws | ||
ENV NCCL_TESTS_VERSION=master | ||
|
||
## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and | ||
# nccl>=2.19.0. See https://github.com/aws-samples/awsome-distributed-training/tree/main/1.architectures/efa-cheatsheet.md | ||
#ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0 | ||
|
||
RUN apt-get update -y | ||
RUN apt-get remove -y --allow-change-held-packages \ | ||
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 | ||
|
||
# We noticed that since 23.09, we can't just delete the whole /opt/hpcx/, otherwise `import torch` | ||
# complains about missing libuc?.so. | ||
RUN rm -rf /opt/hpcx/ompi \ | ||
&& rm -rf /usr/local/mpi \ | ||
&& rm -rf /opt/hpcx/nccl_rdma_sharp_plugin \ | ||
&& ldconfig | ||
ENV OPAL_PREFIX= | ||
# Install build tooling and utilities needed by the EFA / NCCL / OpenMPI stanzas below. | ||
# The package index is refreshed in this same layer so that a cached, stale `apt-get update` | ||
# layer can never be paired with this install step. | ||
RUN apt-get update -y && \ | ||
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ | ||
git \ | ||
gcc \ | ||
vim \ | ||
kmod \ | ||
openssh-client \ | ||
openssh-server \ | ||
build-essential \ | ||
curl \ | ||
autoconf \ | ||
libtool \ | ||
gdb \ | ||
automake \ | ||
cmake \ | ||
apt-utils \ | ||
libhwloc-dev \ | ||
aptitude && \ | ||
# Use apt-get (not apt): apt's CLI is not a stable interface for scripts. | ||
DEBIAN_FRONTEND=noninteractive apt-get autoremove -y | ||
|
||
# EFA | ||
RUN apt-get update && \ | ||
cd /tmp && \ | ||
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ | ||
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ | ||
cd aws-efa-installer && \ | ||
# ONLY add `--skip-kmod`, `--no-verify` and `--skip-limit-conf` flags to container image. | ||
# Those three flags must NOT be used on the host. | ||
# | ||
# Explanations: | ||
# - to build EFA in the Dockerfile, we added --skip-kmod and --no-verify. Without these flags, | ||
# the Dockerfile will fail to build. If installing EFA on the host and not in a container, | ||
# please remove these flags. | ||
# - The --skip-limit-conf can be retained in Dockerfile, but it's redundant as the host already | ||
# has these limits set by efa_installer. | ||
./efa_installer.sh -y -g -d --skip-kmod --no-verify --skip-limit-conf && \ | ||
ldconfig && \ | ||
rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* | ||
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH | ||
ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH | ||
|
||
|
||
#################################################################################################### | ||
# [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official | ||
# binaries. | ||
# | ||
# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the | ||
# aws-ofi-nccl. | ||
#################################################################################################### | ||
#ENV NCCL_VERSION=2.19.3-1 | ||
#RUN cd /opt && \ | ||
# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ | ||
# dpkg -i cuda-keyring_1.0-1_all.deb && \ | ||
# apt update && \ | ||
# apt install -y libnccl2=${NCCL_VERSION} libnccl-dev=${NCCL_VERSION} && \ | ||
# echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf | ||
|
||
|
||
#################################################################################################### | ||
# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The | ||
# benefits of installing to the same location as the built-in version are: | ||
# | ||
# 1. There's only ever a single libnccl version offered by this image, preventing application from | ||
# mistakenly chooses a wrong version. | ||
# 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD. | ||
# | ||
# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the | ||
# aws-ofi-nccl. | ||
#################################################################################################### | ||
ENV NCCL_VERSION=2.19.3-1 | ||
# Remove the packaged NCCL, then build v${NCCL_VERSION} from source into /usr so that only a | ||
# single libnccl is present in the image. | ||
RUN apt-get remove -y libnccl2 libnccl-dev \ | ||
&& cd /tmp \ | ||
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ | ||
&& cd nccl \ | ||
&& make -j src.build BUILDDIR=/usr \ | ||
# Build for p4 & p5. Space-separated -gencode flags, same form as the nccl-tests build. | ||
NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" \ | ||
&& rm -rf /tmp/nccl \ | ||
&& echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf | ||
|
||
|
||
#################################################################################################### | ||
# Rebuild OpenMPI with custom PMIX version. E.g., to match what host's Slurm is built with (see | ||
# /opt/pmix/ on host, or run pmix_info on host). | ||
# | ||
# May be needed on rare occasions when `srun --mpi=pmix --container-image=... <mpi_application>` | ||
# mysteriously crashes. | ||
# | ||
# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the | ||
# aws-ofi-nccl. | ||
#################################################################################################### | ||
ENV OPEN_MPI_PATH=/opt/amazon/openmpi | ||
|
||
# OpenMPI build script claims PMIX_VERSION, and complains if we use it. | ||
ENV CUSTOM_PMIX_VERSION=4.2.6 | ||
RUN apt-get update && apt-get install -y libevent-dev \ | ||
&& cd /tmp \ | ||
&& wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ | ||
&& tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ | ||
&& rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ | ||
&& cd pmix-${CUSTOM_PMIX_VERSION}/ \ | ||
&& ./autogen.pl \ | ||
&& ./configure --prefix=/opt/pmix \ | ||
&& make -j \ | ||
&& make install \ | ||
&& echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \ | ||
&& ldconfig \ | ||
&& cd / \ | ||
&& rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ | ||
# To silence this runtime error message: | ||
# [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168 | ||
ENV PMIX_GDS_MODULE=^ds12 \ | ||
PMIX_MCA_gds=^ds12 | ||
|
||
# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix. | ||
ENV OMPI_VERSION=4.1.6 | ||
RUN rm -fr ${OPEN_MPI_PATH} \ | ||
&& mkdir /tmp/openmpi \ | ||
&& cd /tmp/openmpi \ | ||
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ | ||
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \ | ||
&& rm openmpi-${OMPI_VERSION}.tar.gz \ | ||
&& cd openmpi-${OMPI_VERSION} \ | ||
&& ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \ | ||
&& make -j $(nproc) all \ | ||
&& make install \ | ||
&& ldconfig \ | ||
&& cd / \ | ||
&& rm -rf /tmp/openmpi \ | ||
&& ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \ | ||
# Verify pmix from /opt/pmix/ | ||
&& ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt | ||
#################################################################################################### | ||
|
||
|
||
# NCCL EFA Plugin | ||
# Download the aws-ofi-nccl release tarball, build it against the EFA libfabric, CUDA and the | ||
# OpenMPI under /opt/amazon/openmpi, and install it into /opt/amazon/efa. | ||
RUN mkdir -p /tmp && \ | ||
cd /tmp && \ | ||
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ | ||
tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ | ||
rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ | ||
mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \ | ||
cd /tmp/aws-ofi-nccl && \ | ||
./autogen.sh && \ | ||
./configure --prefix=/opt/amazon/efa \ | ||
--with-libfabric=/opt/amazon/efa \ | ||
--with-cuda=/usr/local/cuda \ | ||
--enable-platform-aws \ | ||
--with-mpi=/opt/amazon/openmpi && \ | ||
make -j$(nproc) install && \ | ||
# Remove the source tree in the same layer (the directory is /tmp/aws-ofi-nccl). | ||
rm -rf /tmp/aws-ofi-nccl | ||
|
||
# Do this to minimize the ld path env vars that users need to define when running this image. | ||
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \ | ||
echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \ | ||
ldconfig | ||
|
||
ENV OMPI_MCA_pml=^cm,ucx \ | ||
OMPI_MCA_btl=tcp,self \ | ||
OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \ | ||
OPAL_PREFIX=/opt/amazon/openmpi \ | ||
# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 | ||
# https://github.com/pytorch/pytorch/issues/68893 | ||
NCCL_SOCKET_IFNAME=^docker,lo | ||
|
||
ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" | ||
|
||
# NCCL-tests: always good to include this as a diagnostic tool. | ||
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ | ||
&& cd /opt/nccl-tests \ | ||
&& git checkout ${NCCL_TESTS_VERSION} \ | ||
&& make MPI=1 \ | ||
MPI_HOME=/opt/amazon/openmpi \ | ||
CUDA_HOME=/usr/local/cuda \ | ||
NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" | ||
|
||
|
||
#################################################################################################### | ||
# Custom packages. Disable as you like. NOTE: always check `pip list` what's been installed. For | ||
# example, the base container comes pre-installed with Transformer Engine, flash attention, triton | ||
# (https://github.com/openai/triton/), etc. | ||
#################################################################################################### | ||
# Install the xformers dependency from source, because pip install either breaks or try to pull | ||
# its own pt + cuda. | ||
# | ||
# Pre-requisite: build node has enough memory to compile xformers. More info on the stanza. | ||
RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \ | ||
# On p4de.24xlarge: | ||
# - MAX_JOBS=16 => 145GB memory | ||
# - MAX_JOBS=32 => 241GB memory | ||
# - MAX_JOBS=48 => 243GB memory, 542.5s | ||
# | ||
# NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to | ||
# work to prevent OOM. | ||
export MAX_JOBS=32 && \ | ||
export NVCC_PREPEND_FLAGS="-t 32" && \ | ||
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers | ||
|
||
RUN pip install transformers datasets | ||
|
||
WORKDIR "/fsx" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#!/bin/bash | ||
|
||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: MIT-0 | ||
|
||
#SBATCH --nodes=2 # number of nodes to use | ||
#SBATCH --job-name=LlamaFP8 # name of your job | ||
#SBATCH --exclusive # job has exclusive use of the resource, no sharing | ||
|
||
set -ex; | ||
|
||
########################### | ||
###### User Variables ##### | ||
########################### | ||
|
||
GPUS_PER_NODE=8 # 4 for G5.12x, 8 for P4/P5 | ||
|
||
########################### | ||
## Environment Variables ## | ||
########################### | ||
|
||
## Plenty of EFA level variables | ||
## Comment out for non-efa instances (G4d, P3) | ||
## For G5.12x, Comment out RDMA and Fork safe | ||
## For G4dn and other G5, comment out all | ||
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d | ||
export FI_EFA_FORK_SAFE=1 | ||
export FI_LOG_LEVEL=1 | ||
export FI_PROVIDER=efa | ||
export NCCL_DEBUG=INFO | ||
## Switching SYNC_MEMOPS to zero can boost throughput with FSDP | ||
## Disables CU_POINTER_ATTRIBUTE_SYNC_MEMOPS | ||
## Reduces memory synchronizations | ||
## https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html | ||
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0 | ||
|
||
# default variables for Enroot | ||
: "${IMAGE:=$(pwd)/transformer-engine.sqsh}" | ||
: "${DATA_PATH:=/fsx}" | ||
: "${FSX_MOUNT:=$(pwd):$DATA_PATH}" | ||
|
||
# Container arguments passed to srun. Values are quoted so that a path containing whitespace | ||
# (both defaults derive from $(pwd)) stays a single array element. | ||
declare -a ARGS=( | ||
--container-image "$IMAGE" | ||
--container-mounts "$FSX_MOUNT" | ||
) | ||
|
||
########################### | ||
####### Torch Dist ####### | ||
########################### | ||
|
||
declare -a TORCHRUN_ARGS=( | ||
--nproc_per_node=$GPUS_PER_NODE | ||
--nnodes=$SLURM_JOB_NUM_NODES | ||
--rdzv_id=$SLURM_JOB_ID | ||
--rdzv_backend=c10d | ||
--rdzv_endpoint=$(hostname) | ||
) | ||
|
||
export TORCHRUN=torchrun | ||
export TRAIN_SCRIPT=./train.py | ||
|
||
############################ | ||
# Llama 2 Training Params ## | ||
############################ | ||
|
||
declare -a TRAINING_ARGS=( | ||
--max_context_width=4096 | ||
--num_key_value_heads=32 # 7b: 32 13b: 40 70b: 8 | ||
--intermediate_size=11008 # 7b: 11008 13b: 13824 70b: 28672 | ||
--hidden_width=4096 # 7b: 4096 13b: 5120 70b: 8192 | ||
--num_layers=32 # 7b: 32 13b: 40 70b: 80 | ||
--num_heads=32 # 7b: 32 13b: 40 70b: 64 | ||
--model_type=llama_v2 | ||
--tokenizer="hf-internal-testing/llama-tokenizer" | ||
--checkpoint_freq=5000 | ||
--validation_freq=100 | ||
--max_steps=5000 | ||
--checkpoint_dir=./checkpoints | ||
--dataset='c4' | ||
--dataset_config_name='en' | ||
--resume_from_checkpoint=./checkpoints | ||
--train_batch_size=1 | ||
--val_batch_size=1 | ||
--sharding_strategy="full" # https://pytorch.org/docs/stable/fsdp.html | ||
--offload_activations=1 | ||
--fp8=1 | ||
) | ||
|
||
AUTO_RESUME="" | ||
if [ -d "/opt/sagemaker_cluster" ]; then | ||
echo "Detected Hyperpod cluster.. enabling --auto-resume=1" | ||
AUTO_RESUME="--auto-resume=1" | ||
fi | ||
|
||
srun ${AUTO_RESUME} -l "${ARGS[@]}" "${TORCHRUN}" "${TORCHRUN_ARGS[@]}" "${TRAIN_SCRIPT}" "${TRAINING_ARGS[@]}" |
Oops, something went wrong.