From e81b3219ce517ccb292f25784ac8545dfb6f62ba Mon Sep 17 00:00:00 2001 From: andrew Date: Thu, 8 Jan 2026 07:35:54 -0800 Subject: [PATCH] feat(ci): wanda ray image builds, uploads to Dockerhub Adds Dockerfile and Wanda files for Ray images, pulling artifacts from previous steps. Buildkite steps also updated to use these, including both build and upload steps. Topic: ray-image Signed-off-by: andrew --- .buildkite/build.rayci.yml | 322 +++++++++++++++--- ci/docker/ray-extra-image-cpu.wanda.yaml | 13 + ci/docker/ray-extra-image-cuda.wanda.yaml | 14 + ci/docker/ray-image-cpu.wanda.yaml | 13 + ci/docker/ray-image-cuda.wanda.yaml | 14 + ci/docker/ray-image.Dockerfile | 48 +++ ci/docker/ray-llm-extra-image-cuda.wanda.yaml | 14 + ci/docker/ray-llm-image-cuda.wanda.yaml | 14 + ci/ray_ci/automation/BUILD.bazel | 26 ++ ci/ray_ci/automation/push_ray_image.py | 308 +++++++++++++++++ ci/ray_ci/automation/test_push_ray_image.py | 309 +++++++++++++++++ 11 files changed, 1048 insertions(+), 47 deletions(-) create mode 100644 ci/docker/ray-extra-image-cpu.wanda.yaml create mode 100644 ci/docker/ray-extra-image-cuda.wanda.yaml create mode 100644 ci/docker/ray-image-cpu.wanda.yaml create mode 100644 ci/docker/ray-image-cuda.wanda.yaml create mode 100644 ci/docker/ray-image.Dockerfile create mode 100644 ci/docker/ray-llm-extra-image-cuda.wanda.yaml create mode 100644 ci/docker/ray-llm-image-cuda.wanda.yaml create mode 100644 ci/ray_ci/automation/push_ray_image.py create mode 100644 ci/ray_ci/automation/test_push_ray_image.py diff --git a/.buildkite/build.rayci.yml b/.buildkite/build.rayci.yml index 6452cc41e27c..f7b686e4726a 100644 --- a/.buildkite/build.rayci.yml +++ b/.buildkite/build.rayci.yml @@ -107,7 +107,6 @@ steps: - ray-java-build - ray-dashboard-build - # Upload cpp wheel to S3 - label: ":s3: upload: cpp wheel (x86_64)" key: linux_cpp_wheels_upload instance_type: small @@ -136,89 +135,317 @@ steps: depends_on: manylinux-x86_64 job_env: manylinux-x86_64 - - label: ":tapioca: build: ray py{{matrix}} docker (x86_64)" - key: ray_images + - name: ray-image-cpu-build + label: "wanda: ray py{{matrix}} cpu (x86_64)" + wanda: ci/docker/ray-image-cpu.wanda.yaml + matrix: + - "3.10" + - "3.11" + - "3.12" + env: + PYTHON_VERSION: "{{matrix}}" + ARCH_SUFFIX: "" + RAY_VERSION: "3.0.0.dev0" tags: - python_dependencies - docker - oss - instance_type: medium - commands: - - bazel run //ci/ray_ci:build_in_docker -- docker --python-version {{matrix}} - --platform cu11.7.1-cudnn8 --platform cu11.8.0-cudnn8 - --platform cu12.1.1-cudnn8 --platform cu12.3.2-cudnn9 - --platform cu12.4.1-cudnn --platform cu12.5.1-cudnn - --platform cu12.6.3-cudnn --platform cu12.8.1-cudnn - --platform cu12.9.1-cudnn - --platform cpu - --image-type ray --upload depends_on: - - manylinux-x86_64 - - forge - - raycudabase + - ray-wheel-build - raycpubase + + - label: ":docker: push: ray py{{matrix}} cpu (x86_64)" + key: ray_images_cpu_push + instance_type: small + commands: + - bazel run //.buildkite:copy_files -- --destination docker_login + - bazel run //ci/ray_ci/automation:push_ray_image -- + --python-version {{matrix}} + --platform cpu + --image-type ray + --architecture x86_64 + --upload matrix: - "3.10" - "3.11" - "3.12" + depends_on: + - ray-image-cpu-build + tags: + - python_dependencies + - docker + - skip-on-premerge + - oss - - label: ":tapioca: build: ray-extra py{{matrix}} docker (x86_64)" - key: ray_extra_images + - name: ray-image-cuda-build + label: "wanda: ray py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + wanda: ci/docker/ray-image-cuda.wanda.yaml + matrix: + setup: + python: + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1-cudnn8" + - "11.8.0-cudnn8" + - "12.1.1-cudnn8" + - "12.3.2-cudnn9" + - "12.4.1-cudnn" + - "12.5.1-cudnn" + - "12.6.3-cudnn" + - "12.8.1-cudnn" + - "12.9.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + ARCH_SUFFIX: "" + RAY_VERSION: "3.0.0.dev0" tags: - python_dependencies - docker - oss - instance_type: medium + depends_on: + - ray-wheel-build + - raycudabase + + - label: ":docker: push: ray py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + key: ray_images_cuda_push + instance_type: small commands: - - bazel run //ci/ray_ci:build_in_docker -- docker --python-version {{matrix}} - --platform cu11.7.1-cudnn8 --platform cu11.8.0-cudnn8 - --platform cu12.1.1-cudnn8 --platform cu12.3.2-cudnn9 - --platform cu12.4.1-cudnn --platform cu12.5.1-cudnn - --platform cu12.6.3-cudnn --platform cu12.8.1-cudnn - --platform cu12.9.1-cudnn - --platform cpu - --image-type ray-extra --upload + - bazel run //.buildkite:copy_files -- --destination docker_login + - bazel run //ci/ray_ci/automation:push_ray_image -- + --python-version {{matrix.python}} + --platform cu{{matrix.cuda}} + --image-type ray + --architecture x86_64 + --upload + matrix: + setup: + python: + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1-cudnn8" + - "11.8.0-cudnn8" + - "12.1.1-cudnn8" + - "12.3.2-cudnn9" + - "12.4.1-cudnn" + - "12.5.1-cudnn" + - "12.6.3-cudnn" + - "12.8.1-cudnn" + - "12.9.1-cudnn" depends_on: - - manylinux-x86_64 - - forge - - raycpubaseextra - - raycudabaseextra + - ray-image-cuda-build + tags: + - python_dependencies + - docker + - skip-on-premerge + - oss + + - name: ray-extra-image-cpu-build + label: "wanda: ray-extra py{{matrix}} cpu (x86_64)" + wanda: ci/docker/ray-extra-image-cpu.wanda.yaml matrix: - "3.10" - "3.11" - "3.12" + env: + PYTHON_VERSION: "{{matrix}}" + ARCH_SUFFIX: "" + RAY_VERSION: "3.0.0.dev0" + tags: + - python_dependencies + - docker + - oss + depends_on: + - ray-wheel-build + - raycpubaseextra - - label: ":tapioca: build: ray-llm py{{matrix}} docker (x86_64)" + - name: ray-extra-image-cuda-build + label: "wanda: ray-extra py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + wanda: ci/docker/ray-extra-image-cuda.wanda.yaml + matrix: + setup: + python: + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1-cudnn8" + - "11.8.0-cudnn8" + - "12.1.1-cudnn8" + - "12.3.2-cudnn9" + - "12.4.1-cudnn" + - "12.5.1-cudnn" + - "12.6.3-cudnn" + - "12.8.1-cudnn" + - "12.9.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + ARCH_SUFFIX: "" + RAY_VERSION: "3.0.0.dev0" tags: - python_dependencies - docker - oss - instance_type: medium - commands: - - bazel run //ci/ray_ci:build_in_docker -- docker --python-version {{matrix}} - --platform cu12.8.1-cudnn --image-type ray-llm --upload depends_on: - - manylinux-x86_64 - - forge + - ray-wheel-build + - raycudabaseextra + + - name: ray-llm-image-cuda-build + label: "wanda: ray-llm py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + wanda: ci/docker/ray-llm-image-cuda.wanda.yaml + matrix: + setup: + python: + - "3.11" + cuda: + - "12.8.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + ARCH_SUFFIX: "" + RAY_VERSION: "3.0.0.dev0" + tags: + - python_dependencies + - docker + - oss + depends_on: + - ray-wheel-build - ray-llmbase + + - name: ray-llm-extra-image-cuda-build + label: "wanda: ray-llm-extra py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + wanda: ci/docker/ray-llm-extra-image-cuda.wanda.yaml + matrix: + setup: + python: + - "3.11" + cuda: + - "12.8.1-cudnn" + env: + PYTHON_VERSION: "{{matrix.python}}" + CUDA_VERSION: "{{matrix.cuda}}" + ARCH_SUFFIX: "" + RAY_VERSION: "3.0.0.dev0" + tags: + - python_dependencies + - docker + - oss + depends_on: + - ray-wheel-build + - ray-llmbaseextra + + - label: ":docker: push: ray-extra py{{matrix}} cpu (x86_64)" + key: ray_extra_images_cpu_push + instance_type: small + commands: + - bazel run //.buildkite:copy_files -- --destination docker_login + - bazel run //ci/ray_ci/automation:push_ray_image -- + --python-version {{matrix}} + --platform cpu + --image-type ray-extra + --architecture x86_64 + --upload matrix: + - "3.10" - "3.11" + - "3.12" + depends_on: + - ray-extra-image-cpu-build + tags: + - python_dependencies + - docker + - skip-on-premerge + - oss - - label: ":tapioca: build: ray-llm-extra py{{matrix}} docker (x86_64)" + - label: ":docker: push: ray-extra py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + key: ray_extra_images_cuda_push + instance_type: small + commands: + - bazel run //.buildkite:copy_files -- --destination docker_login + - bazel run //ci/ray_ci/automation:push_ray_image -- + --python-version {{matrix.python}} + --platform cu{{matrix.cuda}} + --image-type ray-extra + --architecture x86_64 + --upload + matrix: + setup: + python: + - "3.10" + - "3.11" + - "3.12" + cuda: + - "11.7.1-cudnn8" + - "11.8.0-cudnn8" + - "12.1.1-cudnn8" + - "12.3.2-cudnn9" + - "12.4.1-cudnn" + - "12.5.1-cudnn" + - "12.6.3-cudnn" + - "12.8.1-cudnn" + - "12.9.1-cudnn" + depends_on: + - ray-extra-image-cuda-build tags: - python_dependencies - docker + - skip-on-premerge - oss - instance_type: medium + + - label: ":docker: push: ray-llm py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + key: ray_llm_images_cuda_push + instance_type: small commands: - - bazel run //ci/ray_ci:build_in_docker -- docker --python-version {{matrix}} - --platform cu12.8.1-cudnn --image-type ray-llm-extra --upload + - bazel run //.buildkite:copy_files -- --destination docker_login + - bazel run //ci/ray_ci/automation:push_ray_image -- + --python-version {{matrix.python}} + --platform cu{{matrix.cuda}} + --image-type ray-llm + --architecture x86_64 + --upload + matrix: + setup: + python: + - "3.11" + cuda: + - "12.8.1-cudnn" depends_on: - - manylinux-x86_64 - - forge - - ray-llmbaseextra + - ray-llm-image-cuda-build + tags: + - python_dependencies + - docker + - skip-on-premerge + - oss + + - label: ":docker: push: ray-llm-extra py{{matrix.python}} cu{{matrix.cuda}} (x86_64)" + key: ray_llm_extra_images_cuda_push + instance_type: small + commands: + - bazel run //.buildkite:copy_files -- --destination docker_login + - bazel run //ci/ray_ci/automation:push_ray_image -- + --python-version {{matrix.python}} + --platform cu{{matrix.cuda}} + --image-type ray-llm-extra + --architecture x86_64 + --upload matrix: - - "3.11" + setup: + python: + - "3.11" + cuda: + - "12.8.1-cudnn" + depends_on: + - ray-llm-extra-image-cuda-build + tags: + - python_dependencies + - docker + - skip-on-premerge + - oss - label: ":tapioca: smoke test build-docker.sh" tags: @@ -245,6 +472,7 @@ steps: - bazel run .buildkite:copy_files -- --destination docker_login - bazel run //ci/ray_ci/automation:generate_index -- --prefix nightly depends_on: - - ray_images + - ray_images_cpu_push + - ray_images_cuda_push - ray_images_aarch64 - forge diff --git a/ci/docker/ray-extra-image-cpu.wanda.yaml b/ci/docker/ray-extra-image-cpu.wanda.yaml new file mode 100644 index 000000000000..f3f319ec6041 --- /dev/null +++ b/ci/docker/ray-extra-image-cpu.wanda.yaml @@ -0,0 +1,13 @@ +name: "ray-extra-py$PYTHON_VERSION-cpu$ARCH_SUFFIX" +disable_caching: true +froms: + - "cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base-extra$ARCH_SUFFIX" # CPU base-extra image + - "cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX" # Ray wheel +dockerfile: ci/docker/ray-image.Dockerfile +build_args: + - PYTHON_VERSION + - ARCH_SUFFIX + - BASE_IMAGE=cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base-extra$ARCH_SUFFIX + - RAY_WHEEL_IMAGE=cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX + - RAY_COMMIT=$BUILDKITE_COMMIT + - RAY_VERSION diff --git a/ci/docker/ray-extra-image-cuda.wanda.yaml b/ci/docker/ray-extra-image-cuda.wanda.yaml new file mode 100644 index 000000000000..54c6bc2cb44a --- /dev/null +++ b/ci/docker/ray-extra-image-cuda.wanda.yaml @@ -0,0 +1,14 @@ +name: "ray-extra-py$PYTHON_VERSION-cu$CUDA_VERSION$ARCH_SUFFIX" +disable_caching: true +froms: + - "cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra$ARCH_SUFFIX" # CUDA base-extra image + - "cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX" # Ray wheel +dockerfile: ci/docker/ray-image.Dockerfile +build_args: + - PYTHON_VERSION + - CUDA_VERSION + - ARCH_SUFFIX + - BASE_IMAGE=cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra$ARCH_SUFFIX + - RAY_WHEEL_IMAGE=cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX + - RAY_COMMIT=$BUILDKITE_COMMIT + - RAY_VERSION diff --git a/ci/docker/ray-image-cpu.wanda.yaml b/ci/docker/ray-image-cpu.wanda.yaml new file mode 100644 index 000000000000..9285bd5dbbff --- /dev/null +++ b/ci/docker/ray-image-cpu.wanda.yaml @@ -0,0 +1,13 @@ +name: "ray-py$PYTHON_VERSION-cpu$ARCH_SUFFIX" +disable_caching: true +froms: + - "cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base$ARCH_SUFFIX" # CPU base image with Python + deps + - "cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX" # Ray wheel +dockerfile: ci/docker/ray-image.Dockerfile +build_args: + - PYTHON_VERSION + - ARCH_SUFFIX + - BASE_IMAGE=cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base$ARCH_SUFFIX + - RAY_WHEEL_IMAGE=cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX + - RAY_COMMIT=$BUILDKITE_COMMIT + - RAY_VERSION diff --git a/ci/docker/ray-image-cuda.wanda.yaml b/ci/docker/ray-image-cuda.wanda.yaml new file mode 100644 index 000000000000..ef1328c9ef11 --- /dev/null +++ b/ci/docker/ray-image-cuda.wanda.yaml @@ -0,0 +1,14 @@ +name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION$ARCH_SUFFIX" +disable_caching: true +froms: + - "cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base$ARCH_SUFFIX" # CUDA base image + - "cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX" # Ray wheel +dockerfile: ci/docker/ray-image.Dockerfile +build_args: + - PYTHON_VERSION + - CUDA_VERSION + - ARCH_SUFFIX + - BASE_IMAGE=cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base$ARCH_SUFFIX + - RAY_WHEEL_IMAGE=cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX + - RAY_COMMIT=$BUILDKITE_COMMIT + - RAY_VERSION diff --git a/ci/docker/ray-image.Dockerfile b/ci/docker/ray-image.Dockerfile new file mode 100644 index 000000000000..3dc69da2b91c --- /dev/null +++ b/ci/docker/ray-image.Dockerfile @@ -0,0 +1,48 @@ +# syntax=docker/dockerfile:1.3-labs +# +# Ray Image Builder +# ================= +# Installs the Ray wheel into a base image (CPU or CUDA), includes +# pip freeze output for reproducibility. +# +ARG BASE_IMAGE +ARG RAY_WHEEL_IMAGE + +FROM ${RAY_WHEEL_IMAGE} AS wheel-source +FROM ${BASE_IMAGE} + +ARG PYTHON_VERSION=3.10 +ARG RAY_COMMIT=unknown-commit +ARG RAY_VERSION=3.0.0.dev0 + +LABEL io.ray.ray-commit="${RAY_COMMIT}" +LABEL io.ray.ray-version="${RAY_VERSION}" + +COPY --from=wheel-source /*.whl /home/ray/ + +# Install Ray wheel with all extras +# Uses requirements_compiled.txt from base image (already at /home/ray/) +RUN <&2 + ls -l /home/ray/*.whl >&2 + exit 1 +fi +WHEEL_FILE="${WHEEL_FILES[0]}" + +echo "Installing wheel: $WHEEL_FILE" + +$HOME/anaconda3/bin/pip --no-cache-dir install \ + -c /home/ray/requirements_compiled.txt \ + "${WHEEL_FILE}[all]" + +$HOME/anaconda3/bin/pip freeze > /home/ray/pip-freeze.txt + +echo "Ray version: $($HOME/anaconda3/bin/python -c 'import ray; print(ray.__version__)')" +EOF + +CMD ["/bin/bash"] diff --git a/ci/docker/ray-llm-extra-image-cuda.wanda.yaml b/ci/docker/ray-llm-extra-image-cuda.wanda.yaml new file mode 100644 index 000000000000..7e44a0954299 --- /dev/null +++ b/ci/docker/ray-llm-extra-image-cuda.wanda.yaml @@ -0,0 +1,14 @@ +name: "ray-llm-extra-py$PYTHON_VERSION-cu$CUDA_VERSION$ARCH_SUFFIX" +disable_caching: true +froms: + - "cr.ray.io/rayproject/ray-llm-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra$ARCH_SUFFIX" # LLM base-extra image + - "cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX" # Ray wheel +dockerfile: ci/docker/ray-image.Dockerfile +build_args: + - PYTHON_VERSION + - CUDA_VERSION + - ARCH_SUFFIX + - BASE_IMAGE=cr.ray.io/rayproject/ray-llm-py$PYTHON_VERSION-cu$CUDA_VERSION-base-extra$ARCH_SUFFIX + - RAY_WHEEL_IMAGE=cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX + - RAY_COMMIT=$BUILDKITE_COMMIT + - RAY_VERSION diff --git a/ci/docker/ray-llm-image-cuda.wanda.yaml b/ci/docker/ray-llm-image-cuda.wanda.yaml new file mode 100644 index 000000000000..aff90c3e393a --- /dev/null +++ b/ci/docker/ray-llm-image-cuda.wanda.yaml @@ -0,0 +1,14 @@ +name: "ray-llm-py$PYTHON_VERSION-cu$CUDA_VERSION$ARCH_SUFFIX" +disable_caching: true +froms: + - "cr.ray.io/rayproject/ray-llm-py$PYTHON_VERSION-cu$CUDA_VERSION-base$ARCH_SUFFIX" # LLM base image + - "cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX" # Ray wheel +dockerfile: ci/docker/ray-image.Dockerfile +build_args: + - PYTHON_VERSION + - CUDA_VERSION + - ARCH_SUFFIX + - BASE_IMAGE=cr.ray.io/rayproject/ray-llm-py$PYTHON_VERSION-cu$CUDA_VERSION-base$ARCH_SUFFIX + - RAY_WHEEL_IMAGE=cr.ray.io/rayproject/ray-wheel-py$PYTHON_VERSION$ARCH_SUFFIX + - RAY_COMMIT=$BUILDKITE_COMMIT + - RAY_VERSION diff --git a/ci/ray_ci/automation/BUILD.bazel b/ci/ray_ci/automation/BUILD.bazel index 000fd14b9c8a..05db7fa08894 100644 --- a/ci/ray_ci/automation/BUILD.bazel +++ b/ci/ray_ci/automation/BUILD.bazel @@ -316,3 +316,29 @@ py_binary( ci_require("click"), ], ) + +py_binary( + name = "push_ray_image", + srcs = ["push_ray_image.py"], + exec_compatible_with = ["//bazel:py3"], + deps = [ + ":crane_lib", + "//ci/ray_ci:ray_ci_lib", + ci_require("click"), + ], +) + +py_test( + name = "test_push_ray_image", + size = "small", + srcs = ["test_push_ray_image.py"], + exec_compatible_with = ["//bazel:py3"], + tags = [ + "ci_unit", + "team:ci", + ], + deps = [ + ":push_ray_image", + ci_require("pytest"), + ], +) diff --git a/ci/ray_ci/automation/push_ray_image.py b/ci/ray_ci/automation/push_ray_image.py new file mode 100644 index 000000000000..4715cb4145d8 --- /dev/null +++ b/ci/ray_ci/automation/push_ray_image.py @@ -0,0 +1,308 @@ +import logging +import sys +from datetime import datetime +from typing import List + +import click + +from ci.ray_ci.automation.crane_lib import ( + call_crane_copy, + call_crane_manifest, +) +from ci.ray_ci.configs import ( + ARCHITECTURE, + DEFAULT_ARCHITECTURE, + DEFAULT_PYTHON_TAG_VERSION, + PYTHON_VERSIONS, +) +from ci.ray_ci.docker_container import ( + ARCHITECTURES_RAY, + ARCHITECTURES_RAY_LLM, + ARCHITECTURES_RAY_ML, + GPU_PLATFORM, + PLATFORMS_RAY, + PLATFORMS_RAY_LLM, + PLATFORMS_RAY_ML, + PYTHON_VERSIONS_RAY, + PYTHON_VERSIONS_RAY_LLM, + PYTHON_VERSIONS_RAY_ML, + RAY_REPO_MAP, + RayType, +) +from ci.ray_ci.utils import ecr_docker_login + +VALID_IMAGE_TYPES = [rt.value for rt in RayType] + +logging.basicConfig( + level=logging.INFO, + format="%(message)s", + stream=sys.stdout, +) +logger = logging.getLogger(__name__) + + +class PushRayImageError(Exception): + """Error raised when pushing ray images fails.""" + + +def compact_cuda_suffix(platform: str) -> str: + """Convert a CUDA platform string to compact suffix (e.g. cu12.1.1-cudnn8 -> -cu121).""" + platform_base = platform.split("-", 1)[0] + parts = platform_base.split(".") + if len(parts) < 2: + raise PushRayImageError(f"Unrecognized GPU platform format: {platform}") + + return f"-{parts[0]}{parts[1]}" + + +class RayImagePushContext: + """Context for publishing a ray image from Wanda cache to Docker Hub.""" + + ray_type: RayType + python_version: str + platform: str + architecture: str + branch: str + commit: str + rayci_schedule: str + rayci_build_id: str + pull_request: str # buildkite uses "false" or number string + # Computed fields (set in __init__) + arch_suffix: str + wanda_tag: str + docker_hub_repo: str + + def __init__( + self, + ray_type: RayType, + python_version: str, + platform: str, + architecture: str, + branch: str, + commit: str, + rayci_schedule: str, + rayci_build_id: str, + pull_request: str, + ) -> None: + self.ray_type = ray_type + self.python_version = python_version + self.platform = platform + self.architecture = architecture + self.branch = branch + self.commit = commit + self.rayci_schedule = rayci_schedule + self.rayci_build_id = rayci_build_id + self.pull_request = pull_request + + arch_suffix = "" if architecture == DEFAULT_ARCHITECTURE else f"-{architecture}" + self.arch_suffix = arch_suffix + self.wanda_tag = f"{rayci_build_id}-{self.wanda_image_name()}" + self.docker_hub_repo = f"rayproject/{RAY_REPO_MAP[self.ray_type.value]}" + + def assert_published_image_type(self) -> None: + invalid_python_version = ( + f"Invalid python version {self.python_version} for {self.ray_type}" + ) + invalid_platform = f"Invalid platform {self.platform} for {self.ray_type}" + invalid_architecture = ( + f"Invalid architecture {self.architecture} for {self.ray_type}" + ) + + if self.ray_type in [RayType.RAY_ML, RayType.RAY_ML_EXTRA]: + assert self.python_version in PYTHON_VERSIONS_RAY_ML, invalid_python_version + assert self.platform in PLATFORMS_RAY_ML, invalid_platform + assert self.architecture in ARCHITECTURES_RAY_ML, invalid_architecture + elif self.ray_type in [RayType.RAY_LLM, RayType.RAY_LLM_EXTRA]: + assert ( + self.python_version in PYTHON_VERSIONS_RAY_LLM + ), invalid_python_version + assert self.platform in PLATFORMS_RAY_LLM, invalid_platform + assert self.architecture in ARCHITECTURES_RAY_LLM, invalid_architecture + else: + # ray or ray-extra + assert self.python_version in PYTHON_VERSIONS_RAY, invalid_python_version + assert self.platform in PLATFORMS_RAY, invalid_platform + assert self.architecture in ARCHITECTURES_RAY, invalid_architecture + + def destination_tags(self) -> List[str]: + """ + Compute the destination tags for this context. + + Tags are formed as: + {version}{variation}{python_suffix}{platform}{architecture_suffix} + + For example: + - nightly.260107.abc123-py310-cpu + - nightly-extra-py310-cu121 + - nightly.260107.abc123-extra-py310-gpu + - 2.53.0.abc123-py310-cu121 + - 2.53.0.abc123-extra-py310-cu121 + """ + tags = [] + for version in self._versions(): + for plat in self._platform_suffixes(): + for py in self._python_suffixes(): + tags.append( + f"{version}{self._variation_suffix()}{py}{plat}{self.arch_suffix}" + ) + return tags + + def wanda_image_name(self) -> str: + """Get the wanda source image name for this context.""" + if self.platform == "cpu": + return ( + f"{self.ray_type.value}-py{self.python_version}-cpu{self.arch_suffix}" + ) + return f"{self.ray_type.value}-py{self.python_version}-{self.platform}{self.arch_suffix}" + + def _versions(self) -> List[str]: + """Compute version tags based on branch/schedule/PR status.""" + is_master = self.branch == "master" + is_nightly = self.rayci_schedule == "nightly" + is_pull_request = self.pull_request != "false" + is_release = self.branch and self.branch.startswith("releases/") + sha_tag = self.commit[:6] + formatted_date = datetime.now().strftime("%y%m%d") + + if is_master and is_nightly: + return [f"nightly.{formatted_date}.{sha_tag}", "nightly"] + elif is_release: + release_name = self.branch[len("releases/") :] + return [f"{release_name}.{sha_tag}"] + elif is_pull_request: + return [f"pr-{self.pull_request}.{sha_tag}", self.rayci_build_id] + else: + return [sha_tag, self.rayci_build_id] + + def _variation_suffix(self) -> str: + """Get -extra suffix for extra image types.""" + if self.ray_type in { + RayType.RAY_EXTRA, + RayType.RAY_ML_EXTRA, + RayType.RAY_LLM_EXTRA, + }: + return "-extra" + return "" + + def _python_suffixes(self) -> List[str]: + """Get python version suffixes (includes empty for default version).""" + suffixes = [f"-py{self.python_version.replace('.', '')}"] + if self.python_version == DEFAULT_PYTHON_TAG_VERSION: + suffixes.append("") + return suffixes + + def _platform_suffixes(self) -> List[str]: + """Get platform suffixes (includes aliases like -gpu for GPU_PLATFORM).""" + if self.platform == "cpu": + suffixes = ["-cpu"] + # no tag is alias to cpu for ray image + if self.ray_type in {RayType.RAY, RayType.RAY_EXTRA}: + suffixes.append("") + return suffixes + + suffixes = [compact_cuda_suffix(self.platform)] + if self.platform == GPU_PLATFORM: + # gpu is alias to GPU_PLATFORM value for ray image + suffixes.append("-gpu") + # no tag is alias to gpu for ray-ml image + if self.ray_type in {RayType.RAY_ML, RayType.RAY_ML_EXTRA}: + suffixes.append("") + + return suffixes + + +def _image_exists(tag: str) -> bool: + """Check if a container image manifest exists using crane.""" + return_code, _ = call_crane_manifest(tag) + return return_code == 0 + + +def _copy_image(reference: str, destination: str, dry_run: bool = False) -> None: + """Copy a container image from source to destination using crane.""" + if dry_run: + logger.info(f"DRY RUN: Would copy {reference} -> {destination}") + return + + logger.info(f"Copying {reference} -> {destination}") + return_code, output = call_crane_copy(reference, destination) + if return_code != 0: + raise PushRayImageError(f"Crane copy failed: {output}") + logger.info(f"Successfully copied to {destination}") + + +@click.command() +@click.option( + "--python-version", type=click.Choice(list(PYTHON_VERSIONS.keys())), required=True +) +@click.option("--platform", type=click.Choice(list(PLATFORMS_RAY)), required=True) +@click.option( + "--image-type", + type=click.Choice(VALID_IMAGE_TYPES), + required=True, +) +@click.option("--architecture", type=click.Choice(ARCHITECTURE), required=True) +@click.option("--rayci-work-repo", type=str, required=True, envvar="RAYCI_WORK_REPO") +@click.option("--rayci-build-id", type=str, required=True, envvar="RAYCI_BUILD_ID") +@click.option("--branch", type=str, required=True, envvar="BUILDKITE_BRANCH") +@click.option("--commit", type=str, required=True, envvar="BUILDKITE_COMMIT") +@click.option("--rayci-schedule", type=str, default="", envvar="RAYCI_SCHEDULE") +@click.option( + "--pull-request", type=str, default="false", envvar="BUILDKITE_PULL_REQUEST" +) +@click.option("--upload", is_flag=True, default=False) +def main( + python_version: str, + platform: str, + image_type: str, + architecture: str, + rayci_work_repo: str, + rayci_build_id: str, + branch: str, + commit: str, + rayci_schedule: str, + pull_request: str, + upload: bool, +) -> None: + """ + Publish a Wanda-cached ray image to Docker Hub. + + Tags are generated matching the original RayDockerContainer format: + {version}{variation}{python_suffix}{platform}{architecture_suffix} + """ + dry_run = not upload + if dry_run: + logger.info("DRY RUN MODE - no images will be pushed") + + ctx = RayImagePushContext( + ray_type=RayType(image_type), + python_version=python_version, + platform=platform, + architecture=architecture, + branch=branch, + commit=commit, + rayci_schedule=rayci_schedule, + rayci_build_id=rayci_build_id, + pull_request=pull_request, + ) + + ctx.assert_published_image_type() + + ecr_registry = rayci_work_repo.split("/")[0] + ecr_docker_login(ecr_registry) + + src_ref = f"{rayci_work_repo}:{ctx.wanda_tag}" + logger.info(f"Verifying source image in Wanda cache: {src_ref}") + if not _image_exists(src_ref): + raise PushRayImageError(f"Source image not found in Wanda cache: {src_ref}") + + for tag in ctx.destination_tags(): + dest_ref = f"{ctx.docker_hub_repo}:{tag}" + _copy_image(src_ref, dest_ref, dry_run=dry_run) + + logger.info( + f"Successfully pushed {ctx.ray_type.value} image with tags: {ctx.destination_tags()}" + ) + + +if __name__ == "__main__": + main() diff --git a/ci/ray_ci/automation/test_push_ray_image.py b/ci/ray_ci/automation/test_push_ray_image.py new file mode 100644 index 000000000000..3021b1de3032 --- /dev/null +++ b/ci/ray_ci/automation/test_push_ray_image.py @@ -0,0 +1,309 @@ +import sys +from unittest import mock + +import pytest + +from ci.ray_ci.automation.push_ray_image import RayImagePushContext, compact_cuda_suffix +from ci.ray_ci.configs import DEFAULT_ARCHITECTURE, DEFAULT_PYTHON_TAG_VERSION +from ci.ray_ci.docker_container import GPU_PLATFORM, RayType + + +def make_ctx(**overrides) -> RayImagePushContext: + """Create a RayImagePushContext with defaults for testing.""" + defaults = { + "ray_type": RayType.RAY, + "python_version": DEFAULT_PYTHON_TAG_VERSION, + "platform": "cpu", + "architecture": DEFAULT_ARCHITECTURE, + "branch": "master", + "commit": "abc123", + "rayci_schedule": "", + "rayci_build_id": "build123", + "pull_request": "false", + } + defaults.update(overrides) + + return RayImagePushContext(**defaults) + + +class TestWandaImageName: + DEFAULT_TEST_CUDA_PLATFORM = "cu12.1.1-cudnn8" + + @pytest.mark.parametrize( + ("ray_type", "python_version", "platform", "architecture", "expected"), + [ + # CPU images + (RayType.RAY, "3.10", "cpu", DEFAULT_ARCHITECTURE, "ray-py3.10-cpu"), + (RayType.RAY, "3.10", "cpu", "aarch64", "ray-py3.10-cpu-aarch64"), + ( + RayType.RAY_EXTRA, + "3.10", + "cpu", + DEFAULT_ARCHITECTURE, + "ray-extra-py3.10-cpu", + ), + # CUDA images + ( + RayType.RAY, + "3.11", + DEFAULT_TEST_CUDA_PLATFORM, + DEFAULT_ARCHITECTURE, + f"ray-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}", + ), + ( + RayType.RAY, + "3.11", + DEFAULT_TEST_CUDA_PLATFORM, + "aarch64", + f"ray-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}-aarch64", + ), + ( + RayType.RAY_EXTRA, + "3.11", + DEFAULT_TEST_CUDA_PLATFORM, + DEFAULT_ARCHITECTURE, + f"ray-extra-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}", + ), + ( + RayType.RAY_LLM, + "3.11", + DEFAULT_TEST_CUDA_PLATFORM, + DEFAULT_ARCHITECTURE, + f"ray-llm-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}", + ), + ( + RayType.RAY_LLM_EXTRA, + "3.11", + DEFAULT_TEST_CUDA_PLATFORM, + DEFAULT_ARCHITECTURE, + f"ray-llm-extra-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}", + ), + ], + ) + def test_wanda_image_name( + self, ray_type, python_version, platform, architecture, expected + ): + ctx = make_ctx( + ray_type=ray_type, + python_version=python_version, + platform=platform, + architecture=architecture, + ) + assert ctx.wanda_image_name() == expected + + +class TestVariationSuffix: + @pytest.mark.parametrize( + ("ray_type", "expected"), + [ + (RayType.RAY, ""), + (RayType.RAY_EXTRA, "-extra"), + (RayType.RAY_ML, ""), + (RayType.RAY_ML_EXTRA, "-extra"), + (RayType.RAY_LLM, ""), + (RayType.RAY_LLM_EXTRA, "-extra"), + ], + ) + def test_variation_suffix(self, ray_type, expected): + ctx = make_ctx(ray_type=ray_type) + assert ctx._variation_suffix() == expected + + +class TestPythonSuffixes: + @pytest.mark.parametrize( + ("python_version", "expected"), + [ + ( + DEFAULT_PYTHON_TAG_VERSION, + ["-py" + DEFAULT_PYTHON_TAG_VERSION.replace(".", ""), ""], + ), # default gets empty suffix too + ("3.99", ["-py399"]), # non-default gets no empty suffix + ], + ) + def test_python_suffixes(self, python_version, expected): + ctx = make_ctx(python_version=python_version) + assert ctx._python_suffixes() == expected + + +class TestPlatformSuffixes: + @pytest.mark.parametrize( + ("platform", "ray_type", "expected"), + [ + # CPU images + ("cpu", RayType.RAY, ["-cpu", ""]), + ("cpu", RayType.RAY_EXTRA, ["-cpu", ""]), + ("cpu", RayType.RAY_ML, ["-cpu"]), # ray-ml doesn't get empty for cpu + # CUDA images + ("cu11.7.1-cudnn8", RayType.RAY, ["-cu117"]), + ("cu11.8.0-cudnn8", RayType.RAY, ["-cu118"]), + (GPU_PLATFORM, RayType.RAY, [compact_cuda_suffix(GPU_PLATFORM), "-gpu"]), + ( + GPU_PLATFORM, + RayType.RAY_ML, + [compact_cuda_suffix(GPU_PLATFORM), "-gpu", ""], + ), # ray-ml gets empty for GPU_PLATFORM + ], + ) + def test_platform_suffixes(self, platform, ray_type, expected): + ctx = make_ctx(platform=platform, ray_type=ray_type) + assert ctx._platform_suffixes() == expected + + +class TestVersions: + @mock.patch("ci.ray_ci.automation.push_ray_image.datetime") + def test_nightly_master(self, mock_datetime): + mock_datetime.now.return_value.strftime.return_value = "260107" + ctx = make_ctx(branch="master", commit="abc123def456", rayci_schedule="nightly") + assert ctx._versions() == ["nightly.260107.abc123", "nightly"] + + def test_release_branch(self): + ctx = make_ctx(branch="releases/2.44.0", commit="abc123def456") + assert ctx._versions() == ["2.44.0.abc123"] + + def test_pull_request(self): + ctx = make_ctx( + branch="feature-branch", commit="abc123def456", pull_request="12345" + ) + assert ctx._versions() == ["pr-12345.abc123", "build123"] + + def test_other_branch(self): + ctx = make_ctx(branch="feature-branch", commit="abc123def456") + assert ctx._versions() == ["abc123", "build123"] + + +class TestDestinationTags: + """ + Test destination_tags method. + + Tags are formed as: {version}{variation}{python_suffix}{platform}{architecture_suffix} + """ + + @mock.patch("ci.ray_ci.automation.push_ray_image.datetime") + def test_nightly_cpu_default_python(self, mock_datetime): + """Test: nightly.260107.abc123-py310-cpu""" + mock_datetime.now.return_value.strftime.return_value = "260107" + ctx = make_ctx(branch="master", commit="abc123def456", rayci_schedule="nightly") + tags = ctx.destination_tags() + # nightly versions x cpu suffixes x python suffixes + # ["nightly.260107.abc123", "nightly"] x ["-cpu", ""] x ["-py310", ""] + assert "nightly.260107.abc123-py310-cpu" in tags + assert "nightly.260107.abc123-cpu" in tags + assert "nightly.260107.abc123-py310" in tags + assert "nightly.260107.abc123" in tags + assert "nightly-py310-cpu" in tags + assert "nightly-cpu" in tags + assert "nightly-py310" in tags + assert "nightly" in tags + + @mock.patch("ci.ray_ci.automation.push_ray_image.datetime") + def test_nightly_extra_gpu(self, mock_datetime): + """Test: nightly-extra-py310-cu121 and nightly.260107.abc123-extra-py310-gpu""" + mock_datetime.now.return_value.strftime.return_value = "260107" + ctx = make_ctx( + ray_type=RayType.RAY_EXTRA, + platform=GPU_PLATFORM, + branch="master", + commit="abc123def456", + rayci_schedule="nightly", + ) + tags = ctx.destination_tags() + # Should include -extra variation and -gpu alias + assert "nightly.260107.abc123-extra-py310-cu121" in tags + assert "nightly.260107.abc123-extra-py310-gpu" in tags + assert "nightly-extra-py310-cu121" in tags + assert "nightly-extra-py310-gpu" in tags + assert "nightly.260107.abc123-extra-cu121" in tags + assert "nightly-extra-gpu" in tags + + @mock.patch("ci.ray_ci.automation.push_ray_image.datetime") + def test_nightly_gpu_platform_non_default_python(self, mock_datetime): + """Test: nightly.260107.abc123-py311-cu121""" + mock_datetime.now.return_value.strftime.return_value = "260107" + ctx = make_ctx( + python_version="3.11", + platform=GPU_PLATFORM, + branch="master", + commit="abc123def456", + rayci_schedule="nightly", + ) + tags = ctx.destination_tags() + # Should include -cu121, -gpu aliases but NOT empty python suffix (3.11 is not default) + assert "nightly.260107.abc123-py311-cu121" in tags + assert "nightly.260107.abc123-py311-gpu" in tags + assert "nightly-py311-cu121" in tags + assert "nightly-py311-gpu" in tags + # Should NOT have empty python suffix variants + assert "nightly.260107.abc123-cu121" not in tags + assert "nightly-gpu" not in tags + + def test_release_gpu(self): + """Test: 2.53.0.abc123-py310-cu121""" + ctx = make_ctx( + platform=GPU_PLATFORM, branch="releases/2.53.0", commit="abc123def456" + ) + tags = ctx.destination_tags() + assert "2.53.0.abc123-py310-cu121" in tags + assert "2.53.0.abc123-py310-gpu" in tags + # Default python suffix variants + assert "2.53.0.abc123-cu121" in tags + assert "2.53.0.abc123-gpu" in tags + + def test_release_extra_gpu(self): + """Test: 2.53.0.abc123-extra-py310-cu121""" + ctx = make_ctx( + ray_type=RayType.RAY_EXTRA, + platform=GPU_PLATFORM, + branch="releases/2.53.0", + commit="abc123def456", + ) + tags = ctx.destination_tags() + assert "2.53.0.abc123-extra-py310-cu121" in tags + assert "2.53.0.abc123-extra-py310-gpu" in tags + # Default python suffix variants + assert "2.53.0.abc123-extra-cu121" in tags + assert "2.53.0.abc123-extra-gpu" in tags + + def test_release_non_gpu_platform_cuda(self): + """Test release with non-GPU_PLATFORM CUDA version (no -gpu alias).""" + ctx = make_ctx( + python_version="3.11", + platform="cu12.3.2-cudnn9", # Not GPU_PLATFORM + branch="releases/2.44.0", + commit="abc123def456", + ) + tags = ctx.destination_tags() + assert "2.44.0.abc123-py311-cu123" in tags + # Should NOT have -gpu alias since this isn't GPU_PLATFORM + assert "2.44.0.abc123-py311-gpu" not in tags + + def test_release_cpu_aarch64(self): + """Test release with architecture suffix.""" + ctx = make_ctx( + architecture="aarch64", + branch="releases/2.44.0", + commit="abc123def456", + ) + tags = ctx.destination_tags() + assert "2.44.0.abc123-py310-cpu-aarch64" in tags + assert "2.44.0.abc123-cpu-aarch64" in tags + # Empty platform suffix variant (ray cpu alias) + assert "2.44.0.abc123-py310-aarch64" in tags + assert "2.44.0.abc123-aarch64" in tags + + def test_pull_request_tags(self): + """Test PR builds include pr-{number} prefix.""" + ctx = make_ctx(commit="abc123def456", pull_request="12345") + tags = ctx.destination_tags() + assert "pr-12345.abc123-py310-cpu" in tags + assert "build123-py310-cpu" in tags + + def test_feature_branch_non_pr(self): + """Test non-PR feature branch uses sha and build_id.""" + ctx = make_ctx(python_version="3.12", commit="abc123def456") + tags = ctx.destination_tags() + assert "abc123-py312-cpu" in tags + assert "build123-py312-cpu" in tags + + +if __name__ == "__main__": + sys.exit(pytest.main(["-vv", __file__]))