From 00066e9321c64d9b74a3902c431fe5a15ec564c2 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Thu, 20 Nov 2025 11:33:48 -0600
Subject: [PATCH 1/9] FEA first commit for ARM CUDA wheels

---
 .github/runs-on.yml                      |  4 ++
 .github/workflows/cuda13.yml             | 43 ++++++++++++--
 .github/workflows/main.yml               | 29 ++++++++-
 doc/contrib/ci.rst                       | 21 +++++++
 doc/install.rst                          |  7 ++-
 ops/pipeline/build-cuda-arm64.sh         | 75 ++++++++++++++++++++++++
 ops/pipeline/build-cuda13.sh             | 18 +++++-
 ops/pipeline/test-python-wheel-cuda13.sh | 15 ++++-
 ops/pipeline/test-python-wheel-impl.sh   |  7 ++-
 ops/pipeline/test-python-wheel.sh        |  4 +-
 ops/script/release_artifacts.py          |  1 +
 11 files changed, 211 insertions(+), 13 deletions(-)
 create mode 100755 ops/pipeline/build-cuda-arm64.sh

diff --git a/.github/runs-on.yml b/.github/runs-on.yml
index 1d97b8c5de21..b005093420d8 100644
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -34,6 +34,10 @@ runners:
     cpu: 16
     family: ["c6g", "c7g"]
     image: linux-arm64
+  linux-arm64-gpu:
+    family: ["g5g.xlarge"]
+    image: linux-arm64
+    spot: "false"
   windows-gpu:
     family: ["g4dn.2xlarge"]
     image: windows-amd64
diff --git a/.github/workflows/cuda13.yml b/.github/workflows/cuda13.yml
index 5ea448f25cce..b6f33291d178 100644
--- a/.github/workflows/cuda13.yml
+++ b/.github/workflows/cuda13.yml
@@ -36,6 +36,29 @@ jobs:
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
             --prefix cache/${{ github.run_id }}/build-cuda13 \
             build/testxgboost python-package/dist/*.whl
+
+  build-cuda13-arm64:
+    name: Build CUDA 13 (ARM64)
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-arm64-cpu
+      - tag=cuda13-build-cuda13-arm64
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Log into Docker registry (AWS ECR)
+        run: bash ops/pipeline/login-docker-registry.sh
+      - run: |
+          bash ops/pipeline/build-cuda13.sh
+      - name: Stash files
+        run: |
+          python3 ops/pipeline/manage-artifacts.py upload \
+            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
+            --prefix cache/${{ github.run_id }}/build-cuda13-arm64 \
+            python-package/dist/*.whl
   test-cpp-cuda13:
     name: Google Test (C++) with CUDA 13
     needs: [build-cuda13]
@@ -62,12 +85,22 @@ jobs:
       - run: |
           bash ops/pipeline/test-cpp-cuda13.sh
   test-python-cuda13:
-    name: Run Python tests with CUDA 13
-    needs: [build-cuda13]
+    name: Run Python tests with CUDA 13 (${{ matrix.description }})
+    needs: [build-cuda13, build-cuda13-arm64]
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-gpu
-      - tag=cuda13-test-python-cuda13
+      - runner=${{ matrix.runner }}
+      - tag=cuda13-test-python-cuda13-${{ matrix.description }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - description: amd64
+            runner: linux-amd64-gpu
+            artifact_from: build-cuda13
+          - description: arm64
+            runner: linux-arm64-gpu
+            artifact_from: build-cuda13-arm64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -80,7 +113,7 @@ jobs:
         run: |
           python3 ops/pipeline/manage-artifacts.py download \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda13 \
+            --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
             --dest-dir wheelhouse \
             *.whl
       - name: Run Python tests
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 11fb4ff0a7df..5f7aa9aa95dd 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -94,6 +94,28 @@ jobs:
           bash ops/pipeline/build-cuda.sh \
             xgb-ci.gpu_build_rockylinux8_dev_ver enable-rmm
 
+  build-cuda-arm64:
+    name: Build CUDA + manylinux_2_28_aarch64 wheel
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-arm64-cpu
+      - tag=main-build-cuda-arm64
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Log into Docker registry (AWS ECR)
+        run: bash ops/pipeline/login-docker-registry.sh
+      - run: bash ops/pipeline/build-cuda-arm64.sh
+      - name: Stash files
+        run: |
+          python3 ops/pipeline/manage-artifacts.py upload \
+            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
+            --prefix cache/${{ github.run_id }}/build-cuda-arm64 \
+            python-package/dist/*.whl
+
   build-python-wheels-arm64:
     name: Build manylinux_2_28_aarch64 wheel
     runs-on:
@@ -211,7 +233,7 @@ jobs:
 
   test-python-wheel:
     name: Run Python tests (${{ matrix.description }})
-    needs: [build-cuda, build-python-wheels-arm64]
+    needs: [build-cuda, build-cuda-arm64, build-python-wheels-arm64]
     runs-on:
       - runs-on
       - runner=${{ matrix.runner }}
@@ -242,6 +264,11 @@ jobs:
             suite: cpu-arm64
             runner: linux-arm64-cpu
             artifact_from: build-python-wheels-arm64
+          - description: gpu-arm64
+            image_repo: xgb-ci.gpu_aarch64
+            suite: gpu-arm64
+            runner: linux-arm64-gpu
+            artifact_from: build-cuda-arm64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index 964d833e4756..1e9319b1593d 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -198,6 +198,15 @@ Examples: useful tasks for local development
       --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8:main \
       -- ops/pipeline/build-cuda-impl.sh
 
+* Build XGBoost with GPU support on Linux ARM64
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8_aarch64:main \
+      -- ops/pipeline/build-cuda-impl.sh
+
 * Run Python tests
 
   .. code-block:: bash
@@ -217,6 +226,16 @@ Examples: useful tasks for local development
       --use-gpus \
       -- ops/pipeline/test-python-wheel-impl.sh gpu
 
+* Run Python tests with GPU algorithm on Linux ARM64
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_aarch64:main \
+      --use-gpus \
+      -- ops/pipeline/test-python-wheel-impl.sh gpu-arm64
+
 * Run Python tests with GPU algorithm, with multiple GPUs
 
   .. code-block:: bash
@@ -287,6 +306,8 @@ To opt into self-hosted runners (enabled by RunsOn), we use the following specia
     - tag=[unique tag that uniquely identifies the job in the GH Action workflow]
 
 where the runner is defined in ``.github/runs-on.yml``.
+CUDA-enabled ARM64 tests run on the ``linux-arm64-gpu`` runner, which provisions a
+Graviton + NVIDIA GPU instance; the matching builds run on ``linux-arm64-cpu``.
 
 ===================================================================
 The Lay of the Land: how CI pipelines are organized in the codebase
diff --git a/doc/install.rst b/doc/install.rst
index 7fcea0d3b68c..ea466c624acf 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -67,7 +67,7 @@ Capabilities of binary wheels for each platform:
 +=====================+=========+======================+
 | Linux x86_64        | |tick|  |  |tick|              |
 +---------------------+---------+----------------------+
-| Linux aarch64       | |cross| |  |cross|             |
+| Linux aarch64       | |tick|  |  |cross|             |
 +---------------------+---------+----------------------+
 | MacOS x86_64        | |cross| |  |cross|             |
 +---------------------+---------+----------------------+
@@ -76,6 +76,11 @@ Capabilities of binary wheels for each platform:
 | Windows             | |tick|  |  |cross|             |
 +---------------------+---------+----------------------+
 
+Linux aarch64 wheels now ship with CUDA support, so ``pip install xgboost`` on
+modern Jetson or Graviton machines provides the same GPU functionality as the
+Linux x86_64 wheel. Multi-node and multi-GPU training remain experimental on
+ARM64 at this time.
+
 Minimal installation (CPU-only)
 *******************************
 The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning.
diff --git a/ops/pipeline/build-cuda-arm64.sh b/ops/pipeline/build-cuda-arm64.sh
new file mode 100755
index 000000000000..f6078cba5298
--- /dev/null
+++ b/ops/pipeline/build-cuda-arm64.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+## Build XGBoost with CUDA for Linux ARM64
+
+set -euo pipefail
+
+if [[ -z "${GITHUB_SHA:-}" ]]
+then
+  echo "Make sure to set environment variable GITHUB_SHA"
+  exit 1
+fi
+
+IMAGE_REPO="xgb-ci.gpu_build_rockylinux8_aarch64"
+export USE_FEDERATED=1
+export USE_RMM=0
+
+source ops/pipeline/classify-git-branch.sh
+source ops/pipeline/get-docker-registry-details.sh
+source ops/pipeline/get-image-tag.sh
+
+WHEEL_TAG=manylinux_2_28_aarch64
+BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
+MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
+
+echo "--- Build with CUDA (ARM64)"
+
+if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
+then
+  export BUILD_ONLY_SM75=1
+else
+  export BUILD_ONLY_SM75=0
+fi
+
+set -x
+
+python3 ops/docker_run.py \
+  --image-uri ${BUILD_IMAGE_URI} \
+  --run-args='-e BUILD_ONLY_SM75 -e USE_RMM -e USE_FEDERATED' \
+  -- ops/pipeline/build-cuda-impl.sh
+
+echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard"
+python3 ops/docker_run.py \
+  --image-uri ${MANYLINUX_IMAGE_URI} \
+  -- auditwheel repair --only-plat \
+  --plat ${WHEEL_TAG} python-package/dist/*.whl
+python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \
+  wheelhouse/*.whl
+mv -v wheelhouse/*.whl python-package/dist/
+if ! unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then
+  echo "error: libgomp.so was not vendored in the wheel"
+  exit -1
+fi
+
+# Check size of wheel
+pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl
+
+echo "--- Generate meta info"
+python3 ops/script/format_wheel_meta.py \
+  --wheel-path python-package/dist/*.whl  \
+  --commit-hash ${GITHUB_SHA}  \
+  --platform-tag ${WHEEL_TAG}  \
+  --meta-path python-package/dist/
+
+echo "--- Upload Python wheel"
+if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
+then
+  python3 ops/pipeline/manage-artifacts.py upload \
+    --s3-bucket xgboost-nightly-builds \
+    --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
+    python-package/dist/*.whl
+  python3 ops/pipeline/manage-artifacts.py upload \
+    --s3-bucket xgboost-nightly-builds \
+    --prefix ${BRANCH_NAME} --make-public \
+    python-package/dist/meta.json
+fi
+
diff --git a/ops/pipeline/build-cuda13.sh b/ops/pipeline/build-cuda13.sh
index 8e24e8147b70..36caec14ae31 100755
--- a/ops/pipeline/build-cuda13.sh
+++ b/ops/pipeline/build-cuda13.sh
@@ -9,15 +9,29 @@ then
   exit 1
 fi
 
-IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
 export USE_RMM=0
 export USE_FEDERATED=0
 
+ARCH=$(uname -m)
+case "${ARCH}" in
+  x86_64)
+    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
+    WHEEL_TAG=manylinux_2_28_x86_64
+    ;;
+  aarch64)
+    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
+    WHEEL_TAG=manylinux_2_28_aarch64
+    ;;
+  *)
+    echo "Unsupported architecture: ${ARCH}"
+    exit 1
+    ;;
+esac
+
 source ops/pipeline/classify-git-branch.sh
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-WHEEL_TAG=manylinux_2_28_x86_64
 BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
 MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
 
diff --git a/ops/pipeline/test-python-wheel-cuda13.sh b/ops/pipeline/test-python-wheel-cuda13.sh
index 279411779927..495fe5672aa5 100755
--- a/ops/pipeline/test-python-wheel-cuda13.sh
+++ b/ops/pipeline/test-python-wheel-cuda13.sh
@@ -6,7 +6,20 @@ set -euo pipefail
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
+ARCH=$(uname -m)
+case "${ARCH}" in
+  x86_64)
+    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
+    ;;
+  aarch64)
+    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
+    ;;
+  *)
+    echo "Unsupported architecture: ${ARCH}"
+    exit 1
+    ;;
+esac
+
 IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
 
 set -x
diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh
index 5c24e31210d2..88270e85bf5a 100755
--- a/ops/pipeline/test-python-wheel-impl.sh
+++ b/ops/pipeline/test-python-wheel-impl.sh
@@ -13,7 +13,7 @@ suite="$1"
 
 # Cannot set -u before Conda env activation
 case "$suite" in
-  gpu|mgpu)
+  gpu|mgpu|gpu-arm64)
     source activate gpu_test
     ;;
   cpu)
@@ -42,6 +42,11 @@ case "$suite" in
     python -c 'from cupy.cuda import jitify; jitify._init_module()'
     pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu
     ;;
+  gpu-arm64)
+    echo "-- Run Python tests, using a single GPU (ARM64)"
+    python -c 'from cupy.cuda import jitify; jitify._init_module()'
+    pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu
+    ;;
   mgpu)
     echo "-- Run Python tests, using multiple GPUs"
     python -c 'from cupy.cuda import jitify; jitify._init_module()'
diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh
index 9ccdc42042d5..bc83504f2fba 100755
--- a/ops/pipeline/test-python-wheel.sh
+++ b/ops/pipeline/test-python-wheel.sh
@@ -5,14 +5,14 @@ set -euo pipefail
 
 if [[ "$#" -lt 2 ]]
 then
-  echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [image_repo]"
+  echo "Usage: $0 {gpu|mgpu|gpu-arm64|cpu|cpu-arm64} [image_repo]"
   exit 1
 fi
   
 suite="$1"
 image_repo="$2"
 
-if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]]
+if [[ "$suite" == "gpu" || "$suite" == "mgpu" || "$suite" == "gpu-arm64" ]]
 then
   gpu_option="--use-gpus"
 else
diff --git a/ops/script/release_artifacts.py b/ops/script/release_artifacts.py
index ef05a71420ac..a26f5c3ba449 100644
--- a/ops/script/release_artifacts.py
+++ b/ops/script/release_artifacts.py
@@ -154,6 +154,7 @@ def download_python_wheels(branch: str, commit_hash: str, outdir: Path) -> None:
     ]
     cu13_platforms = [
         "manylinux_2_28_x86_64",
+        "manylinux_2_28_aarch64",
     ]
     minimal_platforms = [
         "win_amd64",

From 0df024ff4ff48758edc478c01012625f4c9ab4b8 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 20:26:12 -0800
Subject: [PATCH 2/9] Enable building CUDA 12 wheel on ARM64

---
 .github/workflows/main.yml                | 57 ++++++++++-------------
 ops/pipeline/build-cuda.sh                | 39 ++++++++++------
 ops/pipeline/build-python-wheels-arm64.sh | 51 --------------------
 3 files changed, 48 insertions(+), 99 deletions(-)
 delete mode 100755 ops/pipeline/build-python-wheels-arm64.sh

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 19cb064e22d7..e44b29a10161 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -31,11 +31,21 @@ jobs:
       - run: bash ops/pipeline/build-cpu.sh
 
   build-cuda:
-    name: Build CUDA + manylinux_2_28_x86_64 wheel
+    name: Build CUDA + manylinux_2_28_${{ matrix.arch }} wheel
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-cpu
-      - tag=main-build-cuda
+      - runner=${{ matrix.runner }}
+      - tag=main-build-cuda-${{ matrix.arch }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - arch: aarch64
+          runner: linux-arm64-cpu
+          image_repo: xgb-ci.gpu_build_rockylinux8_aarch64
+        - arch: x86_64
+          runner: linux-amd64-cpu
+          image_repo: xgb-ci.gpu_build_rockylinux8
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -45,12 +55,12 @@ jobs:
       - name: Log into Docker registry (AWS ECR)
         run: bash ops/pipeline/login-docker-registry.sh
       - run: |
-          bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 disable-rmm
+          bash ops/pipeline/build-cuda.sh ${{ matrix.image_repo }} ${{ matrix.arch }} disable-rmm
       - name: Stash files
         run: |
           python3 ops/pipeline/manage-artifacts.py upload \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda \
+            --prefix cache/${{ github.run_id }}/build-cuda-${{ matrix.arch }} \
             build/testxgboost python-package/dist/*.whl
 
   build-cuda-with-rmm:
@@ -76,28 +86,6 @@ jobs:
             --prefix cache/${{ github.run_id }}/build-cuda-with-rmm \
             build/testxgboost
 
-  build-python-wheels-arm64:
-    name: Build manylinux_2_28_aarch64 wheel
-    runs-on:
-      - runs-on=${{ github.run_id }}
-      - runner=linux-arm64-cpu
-      - tag=build-python-wheels-arm64
-    steps:
-      # Restart Docker daemon so that it recognizes the ephemeral disks
-      - run: sudo systemctl restart docker
-      - uses: actions/checkout@v4
-        with:
-          submodules: "true"
-      - name: Log into Docker registry (AWS ECR)
-        run: bash ops/pipeline/login-docker-registry.sh
-      - run: bash ops/pipeline/build-python-wheels-arm64.sh
-      - name: Stash files
-        run: |
-          python3 ops/pipeline/manage-artifacts.py upload \
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-python-wheels-arm64 \
-            python-package/dist/*.whl
-
   build-python-wheels-cpu:
     name: Build CPU wheel for ${{ matrix.manylinux_target }}_${{ matrix.arch }}
     runs-on:
@@ -166,7 +154,10 @@ jobs:
         include:
           - suite: gpu
             runner: linux-amd64-gpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
+          - suite: gpu
+            runner: linux-arm64-gpu
+            artifact_from: build-cuda-aarch64
           - suite: gpu-rmm
             runner: linux-amd64-gpu
             artifact_from: build-cuda-with-rmm
@@ -208,27 +199,27 @@ jobs:
             image_repo: xgb-ci.gpu
             suite: gpu
             runner: linux-amd64-gpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
           - description: multiple-gpu
             image_repo: xgb-ci.gpu
             suite: mgpu
             runner: linux-amd64-mgpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
           - description: cpu-amd64
             image_repo: xgb-ci.cpu
             suite: cpu
             runner: linux-amd64-cpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
           - description: cpu-arm64
             image_repo: xgb-ci.manylinux_2_28_aarch64
             suite: cpu-arm64
             runner: linux-arm64-cpu
-            artifact_from: build-python-wheels-arm64
+            artifact_from: build-cuda-aarch64
           - description: gpu-arm64
             image_repo: xgb-ci.gpu_aarch64
             suite: gpu-arm64
             runner: linux-arm64-gpu
-            artifact_from: build-cuda-arm64
+            artifact_from: build-cuda-aarch64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh
index 3458719bf090..08f135d85f0a 100755
--- a/ops/pipeline/build-cuda.sh
+++ b/ops/pipeline/build-cuda.sh
@@ -9,13 +9,14 @@ then
   exit 1
 fi
 
-if [[ "$#" -lt 2 ]]
+if [[ "$#" -lt 3 ]]
 then
-  echo "Usage: $0 [image_repo] {enable-rmm,disable-rmm}"
+  echo "Usage: $0 [image_repo] {x86_64,aarch64} {enable-rmm,disable-rmm}"
   exit 2
 fi
 image_repo="$1"
-rmm_flag="$2"
+arch="$2"
+rmm_flag="$3"
 export USE_FEDERATED=1
 
 # Validate RMM flag
@@ -36,7 +37,7 @@ source ops/pipeline/classify-git-branch.sh
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-WHEEL_TAG=manylinux_2_28_x86_64
+WHEEL_TAG=manylinux_2_28_${ARCH}
 BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}"
 MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
 
@@ -74,13 +75,17 @@ pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl
 
 if [[ $USE_RMM == 0 ]]
 then
-  # Generate the meta info which includes xgboost version and the commit info
-  echo "--- Generate meta info"
-  python3 ops/script/format_wheel_meta.py \
-    --wheel-path python-package/dist/*.whl  \
-    --commit-hash ${GITHUB_SHA}  \
-    --platform-tag ${WHEEL_TAG}  \
-    --meta-path python-package/dist/
+  if [[ $ARCH == "x86_64" ]]
+  then
+    # Generate the meta info which includes xgboost version and the commit info
+    # TODO(hcho3): Generate meta.json that contains both x86_64 and aarch64 wheels
+    echo "--- Generate meta info"
+    python3 ops/script/format_wheel_meta.py \
+      --wheel-path python-package/dist/*.whl  \
+      --commit-hash ${GITHUB_SHA}  \
+      --platform-tag ${WHEEL_TAG}  \
+      --meta-path python-package/dist/
+  fi
 
   echo "--- Upload Python wheel"
   if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
@@ -89,9 +94,13 @@ then
       --s3-bucket xgboost-nightly-builds \
       --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
       python-package/dist/*.whl
-    python3 ops/pipeline/manage-artifacts.py upload \
-      --s3-bucket xgboost-nightly-builds \
-      --prefix ${BRANCH_NAME} --make-public \
-      python-package/dist/meta.json
+
+    if [[ $ARCH == "x86_64" ]]
+    then
+      python3 ops/pipeline/manage-artifacts.py upload \
+        --s3-bucket xgboost-nightly-builds \
+        --prefix ${BRANCH_NAME} --make-public \
+        python-package/dist/meta.json
+    fi
   fi
 fi
diff --git a/ops/pipeline/build-python-wheels-arm64.sh b/ops/pipeline/build-python-wheels-arm64.sh
deleted file mode 100755
index ff38ceee13de..000000000000
--- a/ops/pipeline/build-python-wheels-arm64.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-## Build and test XGBoost with ARM64 CPU (no GPU, no federated learning)
-
-set -euo pipefail
-
-if [[ -z "${GITHUB_SHA:-}" ]]
-then
-  echo "Make sure to set environment variable GITHUB_SHA"
-  exit 1
-fi
-
-source ops/pipeline/classify-git-branch.sh
-source ops/pipeline/get-docker-registry-details.sh
-source ops/pipeline/get-image-tag.sh
-
-WHEEL_TAG=manylinux_2_28_aarch64
-IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}
-
-echo "--- Build CPU code targeting ARM64"
-set -x
-
-python3 ops/script/pypi_variants.py --use-suffix=na --require-nccl-dep=na
-python3 ops/docker_run.py \
-  --image-uri ${IMAGE_URI} \
-  -- ops/pipeline/build-python-wheels-arm64-impl.sh
-
-echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard"
-python3 ops/docker_run.py \
-  --image-uri ${IMAGE_URI} \
-  -- auditwheel repair --only-plat \
-  --plat ${WHEEL_TAG} python-package/dist/*.whl
-python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \
-  wheelhouse/*.whl
-mv -v wheelhouse/*.whl python-package/dist/
-
-if ! unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then
-  echo "error: libgomp.so was not vendored in the wheel"
-  exit -1
-fi
-
-# Check size of wheel
-pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl
-
-echo "--- Upload Python wheel"
-if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
-then
-  python3 ops/pipeline/manage-artifacts.py upload \
-    --s3-bucket xgboost-nightly-builds \
-    --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
-    python-package/dist/*.whl
-fi

From 5e711fcd657b8ab30c63ef0673587b6b9c1c2e62 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 20:29:41 -0800
Subject: [PATCH 3/9] Make test-python-wheel depend only on build-cuda

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e44b29a10161..daf07bbb008e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -184,7 +184,7 @@ jobs:
 
   test-python-wheel:
     name: Run Python tests (${{ matrix.description }})
-    needs: [build-cuda, build-cuda-arm64, build-python-wheels-arm64]
+    needs: [build-cuda]
     runs-on:
       - runs-on
       - runner=${{ matrix.runner }}

From c4a2aadb9010314cbf35c47fa59d24d1f6ddc73a Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 20:31:28 -0800
Subject: [PATCH 4/9] Fix variable name typo (ARCH -> arch) in build-cuda.sh

---
 ops/pipeline/build-cuda.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh
index 08f135d85f0a..d9aeb3e708e1 100755
--- a/ops/pipeline/build-cuda.sh
+++ b/ops/pipeline/build-cuda.sh
@@ -37,7 +37,7 @@ source ops/pipeline/classify-git-branch.sh
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-WHEEL_TAG=manylinux_2_28_${ARCH}
+WHEEL_TAG=manylinux_2_28_${arch}
 BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}"
 MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
 
@@ -75,7 +75,7 @@ pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl
 
 if [[ $USE_RMM == 0 ]]
 then
-  if [[ $ARCH == "x86_64" ]]
+  if [[ $arch == "x86_64" ]]
   then
     # Generate the meta info which includes xgboost version and the commit info
     # TODO(hcho3): Generate meta.json that contains both x86_64 and aarch64 wheels
@@ -95,7 +95,7 @@ then
       --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
       python-package/dist/*.whl
 
-    if [[ $ARCH == "x86_64" ]]
+    if [[ $arch == "x86_64" ]]
     then
       python3 ops/pipeline/manage-artifacts.py upload \
         --s3-bucket xgboost-nightly-builds \

From 41cc339e28107a6d167aa139e1998661eb018a08 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 20:49:01 -0800
Subject: [PATCH 5/9] Use matrix to de-duplicate jobs for x86_64, aarch64

---
 .github/workflows/cuda13.yml             | 81 ++++++++++++------------
 .github/workflows/main.yml               |  2 +-
 ops/pipeline/build-cuda13.sh             | 24 +++----
 ops/pipeline/test-cpp-cuda13.sh          | 10 ++-
 ops/pipeline/test-python-wheel-cuda13.sh | 23 +++----
 5 files changed, 65 insertions(+), 75 deletions(-)

diff --git a/.github/workflows/cuda13.yml b/.github/workflows/cuda13.yml
index b6f33291d178..57ca9059b284 100644
--- a/.github/workflows/cuda13.yml
+++ b/.github/workflows/cuda13.yml
@@ -15,11 +15,21 @@ env:
 
 jobs:
   build-cuda13:
-    name: Build CUDA 13
+    name: Build CUDA 13 wheel for ${{ matrix.arch }}
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-cpu
-      - tag=cuda13-build-cuda13
+      - runner=${{ matrix.runner }}
+      - tag=cuda13-build-cuda13-${{ matrix.arch }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - arch: aarch64
+          runner: linux-arm64-cpu
+          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
+        - arch: x86_64
+          runner: linux-amd64-cpu
+          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -29,43 +39,31 @@ jobs:
       - name: Log into Docker registry (AWS ECR)
         run: bash ops/pipeline/login-docker-registry.sh
       - run: |
-          bash ops/pipeline/build-cuda13.sh
+          bash ops/pipeline/build-cuda13.sh ${{ matrix.image_repo }} ${{ matrix.arch }}
       - name: Stash files
         run: |
           python3 ops/pipeline/manage-artifacts.py upload \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda13 \
+            --prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
             build/testxgboost python-package/dist/*.whl
 
-  build-cuda13-arm64:
-    name: Build CUDA 13 (ARM64)
-    runs-on:
-      - runs-on=${{ github.run_id }}
-      - runner=linux-arm64-cpu
-      - tag=cuda13-build-cuda13-arm64
-    steps:
-      # Restart Docker daemon so that it recognizes the ephemeral disks
-      - run: sudo systemctl restart docker
-      - uses: actions/checkout@v4
-        with:
-          submodules: "true"
-      - name: Log into Docker registry (AWS ECR)
-        run: bash ops/pipeline/login-docker-registry.sh
-      - run: |
-          bash ops/pipeline/build-cuda13.sh
-      - name: Stash files
-        run: |
-          python3 ops/pipeline/manage-artifacts.py upload \
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda13-arm64 \
-            python-package/dist/*.whl
   test-cpp-cuda13:
-    name: Google Test (C++) with CUDA 13
+    name: Google Test (C++) with CUDA 13, arch ${{ matrix.arch }}
     needs: [build-cuda13]
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-gpu
-      - tag=cuda13-test-cpp-cuda13
+      - runner=${{ matrix.runner }}
+      - tag=cuda13-test-cpp-cuda13-${{ matrix.arch }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - arch: aarch64
+          runner: linux-arm64-gpu
+          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
+        - arch: x86_64
+          runner: linux-amd64-gpu
+          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -78,29 +76,30 @@ jobs:
         run: |
           python3 ops/pipeline/manage-artifacts.py download \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda13 \
+            --prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
             --dest-dir build \
             testxgboost
           chmod +x build/testxgboost
       - run: |
-          bash ops/pipeline/test-cpp-cuda13.sh
+          bash ops/pipeline/test-cpp-cuda13.sh ${{ matrix.image_repo }}
+
   test-python-cuda13:
-    name: Run Python tests with CUDA 13 (${{ matrix.description }})
-    needs: [build-cuda13, build-cuda13-arm64]
+    name: Run Python tests with CUDA 13, arch ${{ matrix.arch }}
+    needs: [build-cuda13]
     runs-on:
       - runs-on=${{ github.run_id }}
       - runner=${{ matrix.runner }}
-      - tag=cuda13-test-python-cuda13-${{ matrix.description }}
+      - tag=cuda13-test-python-cuda13-${{ matrix.arch }}
     strategy:
       fail-fast: false
       matrix:
         include:
-          - description: amd64
+          - arch: x86_64
             runner: linux-amd64-gpu
-            artifact_from: build-cuda13
-          - description: arm64
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
+          - arch: aarch64
             runner: linux-arm64-gpu
-            artifact_from: build-cuda13-arm64
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -113,8 +112,8 @@ jobs:
         run: |
           python3 ops/pipeline/manage-artifacts.py download \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
+            --prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
             --dest-dir wheelhouse \
             *.whl
       - name: Run Python tests
-        run: bash ops/pipeline/test-python-wheel-cuda13.sh
+        run: bash ops/pipeline/test-python-wheel-cuda13.sh ${{ matrix.image_repo }}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index daf07bbb008e..ec6e996ba581 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -78,7 +78,7 @@ jobs:
       - name: Log into Docker registry (AWS ECR)
         run: bash ops/pipeline/login-docker-registry.sh
       - run: |
-          bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 enable-rmm
+          bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 x86_64 enable-rmm
       - name: Stash files
         run: |
           python3 ops/pipeline/manage-artifacts.py upload \
diff --git a/ops/pipeline/build-cuda13.sh b/ops/pipeline/build-cuda13.sh
index 36caec14ae31..03049d06bba0 100755
--- a/ops/pipeline/build-cuda13.sh
+++ b/ops/pipeline/build-cuda13.sh
@@ -9,29 +9,21 @@ then
   exit 1
 fi
 
+if [[ "$#" -lt 2 ]]
+then
+  echo "Usage: $0 [image_repo] {x86_64,aarch64}"
+  exit 2
+fi
+image_repo="$1"
+arch="$2"
 export USE_RMM=0
 export USE_FEDERATED=0
 
-ARCH=$(uname -m)
-case "${ARCH}" in
-  x86_64)
-    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
-    WHEEL_TAG=manylinux_2_28_x86_64
-    ;;
-  aarch64)
-    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
-    WHEEL_TAG=manylinux_2_28_aarch64
-    ;;
-  *)
-    echo "Unsupported architecture: ${ARCH}"
-    exit 1
-    ;;
-esac
-
 source ops/pipeline/classify-git-branch.sh
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
+WHEEL_TAG=manylinux_2_28_${arch}
 BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
 MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
 
diff --git a/ops/pipeline/test-cpp-cuda13.sh b/ops/pipeline/test-cpp-cuda13.sh
index 2ccd7bea6abc..165ea1b1109e 100755
--- a/ops/pipeline/test-cpp-cuda13.sh
+++ b/ops/pipeline/test-cpp-cuda13.sh
@@ -2,11 +2,17 @@
 
 set -euox pipefail
 
+if [[ "$#" -lt 1 ]]
+then
+  echo "Usage: $0 [image_repo]"
+  exit 2
+fi
+image_repo="$1"
+
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-IMAGE_REPO='xgb-ci.gpu_build_cuda13_rockylinux8'
-IMAGE_URI=${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}
+IMAGE_URI=${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}
 
 echo "--- Run Google Tests, using a single GPU, CUDA 13"
 python3 ops/docker_run.py --image-uri ${IMAGE_URI} --use-gpus \
diff --git a/ops/pipeline/test-python-wheel-cuda13.sh b/ops/pipeline/test-python-wheel-cuda13.sh
index 495fe5672aa5..fd76515f8d05 100755
--- a/ops/pipeline/test-python-wheel-cuda13.sh
+++ b/ops/pipeline/test-python-wheel-cuda13.sh
@@ -3,24 +3,17 @@
 
 set -euo pipefail
 
+if [[ "$#" -lt 1 ]]
+then
+  echo "Usage: $0 [image_repo]"
+  exit 2
+fi
+image_repo="$1"
+
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-ARCH=$(uname -m)
-case "${ARCH}" in
-  x86_64)
-    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8"
-    ;;
-  aarch64)
-    IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64"
-    ;;
-  *)
-    echo "Unsupported architecture: ${ARCH}"
-    exit 1
-    ;;
-esac
-
-IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
+IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}"
 
 set -x
 python3 ops/docker_run.py --image-uri "${IMAGE_URI}" --use-gpus \

From 88646e58e4565cbd332d75533f5c5326ca6c1ede Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 20:50:26 -0800
Subject: [PATCH 6/9] Remove unused build-cuda-arm64.sh

---
 ops/pipeline/build-cuda-arm64.sh | 75 --------------------------------
 1 file changed, 75 deletions(-)
 delete mode 100755 ops/pipeline/build-cuda-arm64.sh

diff --git a/ops/pipeline/build-cuda-arm64.sh b/ops/pipeline/build-cuda-arm64.sh
deleted file mode 100755
index f6078cba5298..000000000000
--- a/ops/pipeline/build-cuda-arm64.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/bin/bash
-## Build XGBoost with CUDA for Linux ARM64
-
-set -euo pipefail
-
-if [[ -z "${GITHUB_SHA:-}" ]]
-then
-  echo "Make sure to set environment variable GITHUB_SHA"
-  exit 1
-fi
-
-IMAGE_REPO="xgb-ci.gpu_build_rockylinux8_aarch64"
-export USE_FEDERATED=1
-export USE_RMM=0
-
-source ops/pipeline/classify-git-branch.sh
-source ops/pipeline/get-docker-registry-details.sh
-source ops/pipeline/get-image-tag.sh
-
-WHEEL_TAG=manylinux_2_28_aarch64
-BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
-MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
-
-echo "--- Build with CUDA (ARM64)"
-
-if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
-then
-  export BUILD_ONLY_SM75=1
-else
-  export BUILD_ONLY_SM75=0
-fi
-
-set -x
-
-python3 ops/docker_run.py \
-  --image-uri ${BUILD_IMAGE_URI} \
-  --run-args='-e BUILD_ONLY_SM75 -e USE_RMM -e USE_FEDERATED' \
-  -- ops/pipeline/build-cuda-impl.sh
-
-echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard"
-python3 ops/docker_run.py \
-  --image-uri ${MANYLINUX_IMAGE_URI} \
-  -- auditwheel repair --only-plat \
-  --plat ${WHEEL_TAG} python-package/dist/*.whl
-python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \
-  wheelhouse/*.whl
-mv -v wheelhouse/*.whl python-package/dist/
-if ! unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then
-  echo "error: libgomp.so was not vendored in the wheel"
-  exit -1
-fi
-
-# Check size of wheel
-pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl
-
-echo "--- Generate meta info"
-python3 ops/script/format_wheel_meta.py \
-  --wheel-path python-package/dist/*.whl  \
-  --commit-hash ${GITHUB_SHA}  \
-  --platform-tag ${WHEEL_TAG}  \
-  --meta-path python-package/dist/
-
-echo "--- Upload Python wheel"
-if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
-then
-  python3 ops/pipeline/manage-artifacts.py upload \
-    --s3-bucket xgboost-nightly-builds \
-    --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
-    python-package/dist/*.whl
-  python3 ops/pipeline/manage-artifacts.py upload \
-    --s3-bucket xgboost-nightly-builds \
-    --prefix ${BRANCH_NAME} --make-public \
-    python-package/dist/meta.json
-fi
-

From 7ce0ce3b0a3ba8b4882a6eebf41a28057e080be7 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 20:51:53 -0800
Subject: [PATCH 7/9] Fix typo: IMAGE_REPO -> image_repo in build-cuda13.sh

---
 ops/pipeline/build-cuda13.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ops/pipeline/build-cuda13.sh b/ops/pipeline/build-cuda13.sh
index 03049d06bba0..bd312b7c48b0 100755
--- a/ops/pipeline/build-cuda13.sh
+++ b/ops/pipeline/build-cuda13.sh
@@ -24,7 +24,7 @@ source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
 WHEEL_TAG=manylinux_2_28_${arch}
-BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}"
+BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}"
 MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
 
 echo "--- Build with CUDA"

From 9dfc5ef17d32872f6c59e13a0d82cef28e26ca6c Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 21:07:41 -0800
Subject: [PATCH 8/9] Use correct artifact for mgpu test

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ec6e996ba581..c7d40f2d52a9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -163,7 +163,7 @@ jobs:
             artifact_from: build-cuda-with-rmm
           - suite: mgpu
             runner: linux-amd64-mgpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker

From 7194274637f825253de782dcfe8be92978544f88 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 9 Dec 2025 21:12:19 -0800
Subject: [PATCH 9/9] Use correct Docker image for arm64 gtest

---
 .github/workflows/main.yml   | 6 +++++-
 ops/pipeline/test-cpp-gpu.sh | 9 +++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c7d40f2d52a9..7ba762aeea38 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -154,15 +154,19 @@ jobs:
         include:
           - suite: gpu
             runner: linux-amd64-gpu
+            image_repo: xgb-ci.gpu
             artifact_from: build-cuda-x86_64
           - suite: gpu
             runner: linux-arm64-gpu
+            image_repo: xgb-ci.gpu_aarch64
             artifact_from: build-cuda-aarch64
           - suite: gpu-rmm
             runner: linux-amd64-gpu
+            image_repo: xgb-ci.gpu
             artifact_from: build-cuda-with-rmm
           - suite: mgpu
             runner: linux-amd64-mgpu
+            image_repo: xgb-ci.gpu
             artifact_from: build-cuda-x86_64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
@@ -180,7 +184,7 @@ jobs:
             --dest-dir build \
             testxgboost
           chmod +x build/testxgboost
-      - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }}
+      - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.image_repo }} ${{ matrix.suite }}
 
   test-python-wheel:
     name: Run Python tests (${{ matrix.description }})
diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh
index 3f3992828cef..39810a11b0f9 100755
--- a/ops/pipeline/test-cpp-gpu.sh
+++ b/ops/pipeline/test-cpp-gpu.sh
@@ -2,17 +2,18 @@
 
 set -euox pipefail
 
-if [[ "$#" -lt 1 ]]
+if [[ "$#" -lt 2 ]]
 then
-  echo "Usage: $0 {gpu,gpu-rmm,mgpu}"
+  echo "Usage: $0 [image_repo] {gpu,gpu-rmm,mgpu}"
   exit 1
 fi
-suite=$1
+image_repo=$1
+suite=$2
 
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.gpu:${IMAGE_TAG}
+IMAGE_URI=${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}
 
 case "${suite}" in
   gpu)