dmlc · hcho3 · Dec 10, 2025 · Nov 20, 2025 · Dec 10, 2025 · Dec 10, 2025
diff --git a/.github/workflows/cuda13.yml b/.github/workflows/cuda13.yml
@@ -15,11 +15,21 @@ env:
 
 jobs:
   build-cuda13:
-    name: Build CUDA 13
+    name: Build CUDA 13 wheel for ${{ matrix.arch }}
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-cpu
-      - tag=cuda13-build-cuda13
+      - runner=${{ matrix.runner }}
+      - tag=cuda13-build-cuda13-${{ matrix.arch }}
+    strategy:
+      fail-fast: false  # a failure on one architecture must not cancel the other
+      matrix:
+        include:
+          - arch: aarch64  # built natively on an ARM64 CPU runner
+            runner: linux-arm64-cpu
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
+          - arch: x86_64
+            runner: linux-amd64-cpu
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -29,20 +39,31 @@ jobs:
       - name: Log into Docker registry (AWS ECR)
         run: bash ops/pipeline/login-docker-registry.sh
       - run: |
-          bash ops/pipeline/build-cuda13.sh
+          bash ops/pipeline/build-cuda13.sh ${{ matrix.image_repo }} ${{ matrix.arch }}
       - name: Stash files
         run: |
           python3 ops/pipeline/manage-artifacts.py upload \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda13 \
+            --prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
             build/testxgboost python-package/dist/*.whl
+
   test-cpp-cuda13:
-    name: Google Test (C++) with CUDA 13
+    name: Google Test (C++) with CUDA 13, arch ${{ matrix.arch }}
     needs: [build-cuda13]
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-gpu
-      - tag=cuda13-test-cpp-cuda13
+      - runner=${{ matrix.runner }}
+      - tag=cuda13-test-cpp-cuda13-${{ matrix.arch }}
+    strategy:
+      fail-fast: false  # a failure on one architecture must not cancel the other
+      matrix:
+        include:
+          - arch: aarch64  # arch also selects the artifact prefix downloaded below
+            runner: linux-arm64-gpu
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
+          - arch: x86_64
+            runner: linux-amd64-gpu
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -55,19 +76,30 @@ jobs:
         run: |
           python3 ops/pipeline/manage-artifacts.py download \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda13 \
+            --prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
             --dest-dir build \
             testxgboost
           chmod +x build/testxgboost
       - run: |
-          bash ops/pipeline/test-cpp-cuda13.sh
+          bash ops/pipeline/test-cpp-cuda13.sh ${{ matrix.image_repo }}
+
   test-python-cuda13:
-    name: Run Python tests with CUDA 13
+    name: Run Python tests with CUDA 13, arch ${{ matrix.arch }}
     needs: [build-cuda13]
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-gpu
-      - tag=cuda13-test-python-cuda13
+      - runner=${{ matrix.runner }}
+      - tag=cuda13-test-python-cuda13-${{ matrix.arch }}
+    strategy:
+      fail-fast: false  # a failure on one architecture must not cancel the other
+      matrix:
+        include:
+          - arch: x86_64
+            runner: linux-amd64-gpu
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
+          - arch: aarch64
+            runner: linux-arm64-gpu
+            image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -80,8 +112,8 @@ jobs:
         run: |
           python3 ops/pipeline/manage-artifacts.py download \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda13 \
+            --prefix cache/${{ github.run_id }}/build-cuda13-${{ matrix.arch }} \
             --dest-dir wheelhouse \
             *.whl
       - name: Run Python tests
-        run: bash ops/pipeline/test-python-wheel-cuda13.sh
+        run: bash ops/pipeline/test-python-wheel-cuda13.sh ${{ matrix.image_repo }}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -31,11 +31,21 @@ jobs:
       - run: bash ops/pipeline/build-cpu.sh
 
   build-cuda:
-    name: Build CUDA + manylinux_2_28_x86_64 wheel
+    name: Build CUDA + manylinux_2_28_${{ matrix.arch }} wheel
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-cpu
-      - tag=main-build-cuda
+      - runner=${{ matrix.runner }}
+      - tag=main-build-cuda-${{ matrix.arch }}
+    strategy:
+      fail-fast: false  # a failure on one architecture must not cancel the other
+      matrix:
+        include:
+          - arch: aarch64  # arch is passed to build-cuda.sh and used in the artifact prefix
+            runner: linux-arm64-cpu
+            image_repo: xgb-ci.gpu_build_rockylinux8_aarch64
+          - arch: x86_64
+            runner: linux-amd64-cpu
+            image_repo: xgb-ci.gpu_build_rockylinux8
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -45,12 +55,12 @@ jobs:
       - name: Log into Docker registry (AWS ECR)
         run: bash ops/pipeline/login-docker-registry.sh
       - run: |
-          bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 disable-rmm
+          bash ops/pipeline/build-cuda.sh ${{ matrix.image_repo }} ${{ matrix.arch }} disable-rmm
       - name: Stash files
         run: |
           python3 ops/pipeline/manage-artifacts.py upload \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-cuda \
+            --prefix cache/${{ github.run_id }}/build-cuda-${{ matrix.arch }} \
             build/testxgboost python-package/dist/*.whl
 
   build-cuda-with-rmm:
@@ -68,36 +78,14 @@ jobs:
       - name: Log into Docker registry (AWS ECR)
         run: bash ops/pipeline/login-docker-registry.sh
       - run: |
-          bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 enable-rmm
+          bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 x86_64 enable-rmm
       - name: Stash files
         run: |
           python3 ops/pipeline/manage-artifacts.py upload \
             --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
             --prefix cache/${{ github.run_id }}/build-cuda-with-rmm \
             build/testxgboost
 
-  build-python-wheels-arm64:
-    name: Build manylinux_2_28_aarch64 wheel
-    runs-on:
-      - runs-on=${{ github.run_id }}
-      - runner=linux-arm64-cpu
-      - tag=build-python-wheels-arm64
-    steps:
-      # Restart Docker daemon so that it recognizes the ephemeral disks
-      - run: sudo systemctl restart docker
-      - uses: actions/checkout@v4
-        with:
-          submodules: "true"
-      - name: Log into Docker registry (AWS ECR)
-        run: bash ops/pipeline/login-docker-registry.sh
-      - run: bash ops/pipeline/build-python-wheels-arm64.sh
-      - name: Stash files
-        run: |
-          python3 ops/pipeline/manage-artifacts.py upload \
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix cache/${{ github.run_id }}/build-python-wheels-arm64 \
-            python-package/dist/*.whl
-
   build-python-wheels-cpu:
     name: Build CPU wheel for ${{ matrix.manylinux_target }}_${{ matrix.arch }}
     runs-on:
@@ -166,13 +154,20 @@ jobs:
         include:
           - suite: gpu
             runner: linux-amd64-gpu
-            artifact_from: build-cuda
+            image_repo: xgb-ci.gpu
+            artifact_from: build-cuda-x86_64
+          - suite: gpu
+            runner: linux-arm64-gpu
+            image_repo: xgb-ci.gpu_aarch64
+            artifact_from: build-cuda-aarch64
           - suite: gpu-rmm
             runner: linux-amd64-gpu
+            image_repo: xgb-ci.gpu
             artifact_from: build-cuda-with-rmm
           - suite: mgpu
             runner: linux-amd64-mgpu
-            artifact_from: build-cuda
+            image_repo: xgb-ci.gpu
+            artifact_from: build-cuda-x86_64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
@@ -189,11 +184,11 @@ jobs:
             --dest-dir build \
             testxgboost
           chmod +x build/testxgboost
-      - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }}
+      - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.image_repo }} ${{ matrix.suite }}
 
   test-python-wheel:
     name: Run Python tests (${{ matrix.description }})
-    needs: [build-cuda, build-python-wheels-arm64]
+    needs: [build-cuda]
     runs-on:
       - runs-on
       - runner=${{ matrix.runner }}
@@ -208,22 +203,27 @@ jobs:
             image_repo: xgb-ci.gpu
             suite: gpu
             runner: linux-amd64-gpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
           - description: multiple-gpu
             image_repo: xgb-ci.gpu
             suite: mgpu
             runner: linux-amd64-mgpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
           - description: cpu-amd64
             image_repo: xgb-ci.cpu
             suite: cpu
             runner: linux-amd64-cpu
-            artifact_from: build-cuda
+            artifact_from: build-cuda-x86_64
           - description: cpu-arm64
             image_repo: xgb-ci.manylinux_2_28_aarch64
             suite: cpu-arm64
             runner: linux-arm64-cpu
-            artifact_from: build-python-wheels-arm64
+            artifact_from: build-cuda-aarch64
+          - description: gpu-arm64
+            image_repo: xgb-ci.gpu_aarch64
+            suite: gpu-arm64
+            runner: linux-arm64-gpu
+            artifact_from: build-cuda-aarch64
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker

diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
@@ -198,6 +198,15 @@ Examples: useful tasks for local development
       --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8:main \
       -- ops/pipeline/build-cuda-impl.sh
 
+* Build XGBoost with GPU support on Linux ARM64
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8_aarch64:main \
+      -- ops/pipeline/build-cuda-impl.sh
+
 * Run Python tests
 
   .. code-block:: bash
@@ -217,6 +226,16 @@ Examples: useful tasks for local development
       --use-gpus \
       -- ops/pipeline/test-python-wheel-impl.sh gpu
 
+* Run Python tests with GPU algorithm on Linux ARM64
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_aarch64:main \
+      --use-gpus \
+      -- ops/pipeline/test-python-wheel-impl.sh gpu-arm64
+
 * Run Python tests with GPU algorithm, with multiple GPUs
 
   .. code-block:: bash
@@ -287,6 +306,8 @@ To opt into self-hosted runners (enabled by RunsOn), we use the following specia
     - tag=[unique tag that uniquely identifies the job in the GH Action workflow]
 
 where the runner is defined in ``.github/runs-on.yml``.
+CUDA-enabled ARM64 builds run on the ``linux-arm64-cpu`` runner, while the GPU tests
+run on the ``linux-arm64-gpu`` runner, which provisions a Graviton + NVIDIA GPU instance.
 
 ===================================================================
 The Lay of the Land: how CI pipelines are organized in the codebase

diff --git a/doc/install.rst b/doc/install.rst
@@ -67,7 +67,7 @@ Capabilities of binary wheels for each platform:
 +=====================+=========+======================+
 | Linux x86_64        | |tick|  |  |tick|              |
 +---------------------+---------+----------------------+
-| Linux aarch64       | |cross| |  |cross|             |
+| Linux aarch64       | |tick|  |  |cross|             |
 +---------------------+---------+----------------------+
 | MacOS x86_64        | |cross| |  |cross|             |
 +---------------------+---------+----------------------+
@@ -76,6 +76,11 @@ Capabilities of binary wheels for each platform:
 | Windows             | |tick|  |  |cross|             |
 +---------------------+---------+----------------------+
 
+Linux aarch64 wheels now ship with CUDA support, so ``pip install xgboost`` on
+modern Jetson or Graviton machines provides the same single-GPU functionality as
+the Linux x86_64 wheel. Multi-node and multi-GPU training remain experimental on
+Linux aarch64 at this time.
+
 Minimal installation (CPU-only)
 *******************************
 The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning.

diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh
@@ -9,13 +9,14 @@ then
   exit 1
 fi
 
-if [[ "$#" -lt 2 ]]
+if [[ "$#" -lt 3 ]]
 then
-  echo "Usage: $0 [image_repo] {enable-rmm,disable-rmm}"
+  echo "Usage: $0 [image_repo] {x86_64,aarch64} {enable-rmm,disable-rmm}"
   exit 2
 fi
 image_repo="$1"
-rmm_flag="$2"
+arch="$2"
+rmm_flag="$3"
 export USE_FEDERATED=1
 
 # Validate RMM flag
@@ -36,7 +37,7 @@ source ops/pipeline/classify-git-branch.sh
 source ops/pipeline/get-docker-registry-details.sh
 source ops/pipeline/get-image-tag.sh
 
-WHEEL_TAG=manylinux_2_28_x86_64
+WHEEL_TAG=manylinux_2_28_${arch}
 BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}"
 MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}"
 
@@ -74,13 +75,17 @@ pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl
 
 if [[ $USE_RMM == 0 ]]
 then
-  # Generate the meta info which includes xgboost version and the commit info
-  echo "--- Generate meta info"
-  python3 ops/script/format_wheel_meta.py \
-    --wheel-path python-package/dist/*.whl  \
-    --commit-hash ${GITHUB_SHA}  \
-    --platform-tag ${WHEEL_TAG}  \
-    --meta-path python-package/dist/
+  if [[ $arch == "x86_64" ]]
+  then
+    # Generate the meta info which includes xgboost version and the commit info
+    # TODO(hcho3): Generate meta.json that contains both x86_64 and aarch64 wheels
+    echo "--- Generate meta info"
+    python3 ops/script/format_wheel_meta.py \
+      --wheel-path python-package/dist/*.whl  \
+      --commit-hash ${GITHUB_SHA}  \
+      --platform-tag ${WHEEL_TAG}  \
+      --meta-path python-package/dist/
+  fi
 
   echo "--- Upload Python wheel"
   if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
@@ -89,9 +94,13 @@ then
       --s3-bucket xgboost-nightly-builds \
       --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
       python-package/dist/*.whl
-    python3 ops/pipeline/manage-artifacts.py upload \
-      --s3-bucket xgboost-nightly-builds \
-      --prefix ${BRANCH_NAME} --make-public \
-      python-package/dist/meta.json
+
+    if [[ $arch == "x86_64" ]]
+    then
+      python3 ops/pipeline/manage-artifacts.py upload \
+        --s3-bucket xgboost-nightly-builds \
+        --prefix ${BRANCH_NAME} --make-public \
+        python-package/dist/meta.json
+    fi
   fi
 fi