Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 47 additions & 15 deletions .github/workflows/cuda13.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,21 @@ env:
jobs:
build-cuda13:
name: Build CUDA 13
name: Build CUDA 13 wheel for ${{ matrix.arch }}
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-amd64-cpu
- tag=cuda13-build-cuda13
- runner=${{ matrix.runner }}
- tag=cuda13-build-cuda13-${{ matrix.arch }}
strategy:
fail-fast: false
matrix:
include:
- arch: aarch64
runner: linux-arm64-cpu
image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
- arch: x86_64
runner: linux-amd64-cpu
image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
Expand All @@ -29,20 +39,31 @@ jobs:
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: |
bash ops/pipeline/build-cuda13.sh
bash ops/pipeline/build-cuda13.sh ${{ matrix.image_repo }} ${{ matrix.arch }}
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13 \
--prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
build/testxgboost python-package/dist/*.whl
test-cpp-cuda13:
name: Google Test (C++) with CUDA 13
name: Google Test (C++) with CUDA 13, arch ${{ matrix.arch }}
needs: [build-cuda13]
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-amd64-gpu
- tag=cuda13-test-cpp-cuda13
- runner=${{ matrix.runner }}
- tag=cuda13-test-cpp-cuda13-${{ matrix.arch }}
strategy:
fail-fast: false
matrix:
include:
- arch: aarch64
runner: linux-arm64-gpu
image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
- arch: x86_64
runner: linux-amd64-gpu
image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
Expand All @@ -55,19 +76,30 @@ jobs:
run: |
python3 ops/pipeline/manage-artifacts.py download \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13 \
--prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
--dest-dir build \
testxgboost
chmod +x build/testxgboost
- run: |
bash ops/pipeline/test-cpp-cuda13.sh
bash ops/pipeline/test-cpp-cuda13.sh ${{ matrix.image_repo }}
test-python-cuda13:
name: Run Python tests with CUDA 13
name: Run Python tests with CUDA 13, arch ${{ matrix.arch }}
needs: [build-cuda13]
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-amd64-gpu
- tag=cuda13-test-python-cuda13
- runner=${{ matrix.runner }}
- tag=cuda13-test-python-cuda13-${{ matrix.arch }}
strategy:
fail-fast: false
matrix:
include:
- arch: x86_64
runner: linux-amd64-gpu
image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
- arch: aarch64
runner: linux-arm64-gpu
image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
Expand All @@ -80,8 +112,8 @@ jobs:
run: |
python3 ops/pipeline/manage-artifacts.py download \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda13 \
--prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
--dest-dir wheelhouse \
*.whl
- name: Run Python tests
run: bash ops/pipeline/test-python-wheel-cuda13.sh
run: bash ops/pipeline/test-python-wheel-cuda13.sh ${{ matrix.image_repo }}
72 changes: 36 additions & 36 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,21 @@ jobs:
- run: bash ops/pipeline/build-cpu.sh

build-cuda:
name: Build CUDA + manylinux_2_28_x86_64 wheel
name: Build CUDA + manylinux_2_28_${{ matrix.arch }} wheel
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-amd64-cpu
- tag=main-build-cuda
- runner=${{ matrix.runner }}
- tag=main-build-cuda-${{ matrix.arch }}
strategy:
fail-fast: false
matrix:
include:
- arch: aarch64
runner: linux-arm64-cpu
image_repo: xgb-ci.gpu_build_rockylinux8_aarch64
- arch: x86_64
runner: linux-amd64-cpu
image_repo: xgb-ci.gpu_build_rockylinux8
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
Expand All @@ -45,12 +55,12 @@ jobs:
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: |
bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 disable-rmm
bash ops/pipeline/build-cuda.sh ${{ matrix.image_repo }} ${{ matrix.arch }} disable-rmm
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda \
--prefix cache/${{ github.run_id }}/build-cuda-${{ matrix.arch }} \
build/testxgboost python-package/dist/*.whl

build-cuda-with-rmm:
Expand All @@ -68,36 +78,14 @@ jobs:
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: |
bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 enable-rmm
bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 x86_64 enable-rmm
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-cuda-with-rmm \
build/testxgboost

build-python-wheels-arm64:
name: Build manylinux_2_28_aarch64 wheel
runs-on:
- runs-on=${{ github.run_id }}
- runner=linux-arm64-cpu
- tag=build-python-wheels-arm64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
- uses: actions/checkout@v4
with:
submodules: "true"
- name: Log into Docker registry (AWS ECR)
run: bash ops/pipeline/login-docker-registry.sh
- run: bash ops/pipeline/build-python-wheels-arm64.sh
- name: Stash files
run: |
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
--prefix cache/${{ github.run_id }}/build-python-wheels-arm64 \
python-package/dist/*.whl

build-python-wheels-cpu:
name: Build CPU wheel for ${{ matrix.manylinux_target }}_${{ matrix.arch }}
runs-on:
Expand Down Expand Up @@ -166,13 +154,20 @@ jobs:
include:
- suite: gpu
runner: linux-amd64-gpu
artifact_from: build-cuda
image_repo: xgb-ci.gpu
artifact_from: build-cuda-x86_64
- suite: gpu
runner: linux-arm64-gpu
image_repo: xgb-ci.gpu_aarch64
artifact_from: build-cuda-aarch64
- suite: gpu-rmm
runner: linux-amd64-gpu
image_repo: xgb-ci.gpu
artifact_from: build-cuda-with-rmm
- suite: mgpu
runner: linux-amd64-mgpu
artifact_from: build-cuda
image_repo: xgb-ci.gpu
artifact_from: build-cuda-x86_64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
Expand All @@ -189,11 +184,11 @@ jobs:
--dest-dir build \
testxgboost
chmod +x build/testxgboost
- run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }}
- run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.image_repo }} ${{ matrix.suite }}

test-python-wheel:
name: Run Python tests (${{ matrix.description }})
needs: [build-cuda, build-python-wheels-arm64]
needs: [build-cuda]
runs-on:
- runs-on
- runner=${{ matrix.runner }}
Expand All @@ -208,22 +203,27 @@ jobs:
image_repo: xgb-ci.gpu
suite: gpu
runner: linux-amd64-gpu
artifact_from: build-cuda
artifact_from: build-cuda-x86_64
- description: multiple-gpu
image_repo: xgb-ci.gpu
suite: mgpu
runner: linux-amd64-mgpu
artifact_from: build-cuda
artifact_from: build-cuda-x86_64
- description: cpu-amd64
image_repo: xgb-ci.cpu
suite: cpu
runner: linux-amd64-cpu
artifact_from: build-cuda
artifact_from: build-cuda-x86_64
- description: cpu-arm64
image_repo: xgb-ci.manylinux_2_28_aarch64
suite: cpu-arm64
runner: linux-arm64-cpu
artifact_from: build-python-wheels-arm64
artifact_from: build-cuda-aarch64
- description: gpu-arm64
image_repo: xgb-ci.gpu_aarch64
suite: gpu-arm64
runner: linux-arm64-gpu
artifact_from: build-cuda-aarch64
steps:
# Restart Docker daemon so that it recognizes the ephemeral disks
- run: sudo systemctl restart docker
Expand Down
21 changes: 21 additions & 0 deletions doc/contrib/ci.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,15 @@ Examples: useful tasks for local development
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8:main \
-- ops/pipeline/build-cuda-impl.sh

* Build XGBoost with GPU support on Linux ARM64

.. code-block:: bash

export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
python3 ops/docker_run.py \
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8_aarch64:main \
-- ops/pipeline/build-cuda-impl.sh

* Run Python tests

.. code-block:: bash
Expand All @@ -217,6 +226,16 @@ Examples: useful tasks for local development
--use-gpus \
-- ops/pipeline/test-python-wheel-impl.sh gpu

* Run Python tests with GPU algorithm on Linux ARM64

.. code-block:: bash

export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
python3 ops/docker_run.py \
--image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_aarch64:main \
--use-gpus \
-- ops/pipeline/test-python-wheel-impl.sh gpu-arm64

* Run Python tests with GPU algorithm, with multiple GPUs

.. code-block:: bash
Expand Down Expand Up @@ -287,6 +306,8 @@ To opt into self-hosted runners (enabled by RunsOn), we use the following specia
- tag=[unique tag that uniquely identifies the job in the GH Action workflow]

where the runner is defined in ``.github/runs-on.yml``.
CUDA-enabled ARM64 wheels are built on the ``linux-arm64-cpu`` runner; the GPU tests for
ARM64 run on the ``linux-arm64-gpu`` runner, which provisions a Graviton + NVIDIA GPU instance.

===================================================================
The Lay of the Land: how CI pipelines are organized in the codebase
Expand Down
7 changes: 6 additions & 1 deletion doc/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ Capabilities of binary wheels for each platform:
+=====================+=========+======================+
| Linux x86_64 | |tick| | |tick| |
+---------------------+---------+----------------------+
| Linux aarch64 | |cross| | |cross| |
| Linux aarch64 | |tick| | |cross| |
+---------------------+---------+----------------------+
| MacOS x86_64 | |cross| | |cross| |
+---------------------+---------+----------------------+
Expand All @@ -76,6 +76,11 @@ Capabilities of binary wheels for each platform:
| Windows | |tick| | |cross| |
+---------------------+---------+----------------------+

Linux aarch64 wheels now ship with CUDA support, so ``pip install xgboost`` on
modern Jetson or Graviton machines provides GPU-accelerated training, as the
Linux x86_64 wheel does. Note that, as the table above shows, multi-node
multi-GPU training is not yet enabled in the Linux aarch64 wheel.

Minimal installation (CPU-only)
*******************************
The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning.
Expand Down
39 changes: 24 additions & 15 deletions ops/pipeline/build-cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@ then
exit 1
fi

if [[ "$#" -lt 2 ]]
if [[ "$#" -lt 3 ]]
then
echo "Usage: $0 [image_repo] {enable-rmm,disable-rmm}"
echo "Usage: $0 [image_repo] {x86_64,aarch64} {enable-rmm,disable-rmm}"
exit 2
fi
image_repo="$1"
rmm_flag="$2"
arch="$2"
rmm_flag="$3"
export USE_FEDERATED=1

# Validate RMM flag
Expand All @@ -36,7 +37,7 @@ source ops/pipeline/classify-git-branch.sh
source ops/pipeline/get-docker-registry-details.sh
source ops/pipeline/get-image-tag.sh

WHEEL_TAG=manylinux_2_28_x86_64
WHEEL_TAG=manylinux_2_28_${arch}
BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}"
MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"

Expand Down Expand Up @@ -74,13 +75,17 @@ pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl

if [[ $USE_RMM == 0 ]]
then
# Generate the meta info which includes xgboost version and the commit info
echo "--- Generate meta info"
python3 ops/script/format_wheel_meta.py \
--wheel-path python-package/dist/*.whl \
--commit-hash ${GITHUB_SHA} \
--platform-tag ${WHEEL_TAG} \
--meta-path python-package/dist/
if [[ $arch == "x86_64" ]]
then
# Generate the meta info which includes xgboost version and the commit info
# TODO(hcho3): Generate meta.json that contains both x86_64 and aarch64 wheels
echo "--- Generate meta info"
python3 ops/script/format_wheel_meta.py \
--wheel-path python-package/dist/*.whl \
--commit-hash ${GITHUB_SHA} \
--platform-tag ${WHEEL_TAG} \
--meta-path python-package/dist/
fi

echo "--- Upload Python wheel"
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
Expand All @@ -89,9 +94,13 @@ then
--s3-bucket xgboost-nightly-builds \
--prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
python-package/dist/*.whl
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket xgboost-nightly-builds \
--prefix ${BRANCH_NAME} --make-public \
python-package/dist/meta.json

if [[ $arch == "x86_64" ]]
then
python3 ops/pipeline/manage-artifacts.py upload \
--s3-bucket xgboost-nightly-builds \
--prefix ${BRANCH_NAME} --make-public \
python-package/dist/meta.json
fi
fi
fi
Loading
Loading