From f2ba8b699053c38679cddcda3762c200b9662f9e Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Thu, 22 Jan 2026 10:30:07 +0100 Subject: [PATCH 1/6] ci: Update Ubuntu version from 22.04 to 24.04 for AMD CI --- ci/cscs-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 2e051e6e7c..5dcbc7b442 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -65,7 +65,7 @@ stages: BASE_IMAGE: jfrog.svc.cscs.ch/dockerhub/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete EXTRA_UV_SYNC_ARGS: "--extra rocm6_0" EXTRA_UV_ENV_VARS: "CUPY_INSTALL_USE_HIP=1 HCC_AMDGPU_TARGET=gfx942 ROCM_HOME=/opt/rocm" - UBUNTU_VERSION: '22.04' + UBUNTU_VERSION: '24.04' build_cscs_gh200: extends: From 28e9350d49e23be9afa00d669f6df90e31ba9089 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Thu, 22 Jan 2026 13:39:25 +0100 Subject: [PATCH 2/6] try manipulating LD_LIBRARY_PATH --- ci/cscs-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 5dcbc7b442..454e1be38d 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -121,8 +121,8 @@ build_cscs_amd_rocm: # The output folder is inside the docker image (${WORKDIR}/gt4py), so it won't take up # space on $SCRATCH but in a ephemeral tmpfs mount in the running node. # for git>=2.49: mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 --revision "${CI_COMMIT_SHA}" "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py" - - mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py" - - cd "${WORKDIR}/gt4py" && git fetch --depth 1 origin "${CI_COMMIT_SHA}" && git checkout "${CI_COMMIT_SHA}" + - LD_LIBRARY_PATH=/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py" + - LD_LIBRARY_PATH=/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH cd "${WORKDIR}/gt4py" && git fetch --depth 1 origin "${CI_COMMIT_SHA}" && git checkout "${CI_COMMIT_SHA}" - export NOX_SESSION_ARGS="${VARIANT:+($VARIANT}${SUBVARIANT:+, $SUBVARIANT}${DETAIL:+, $DETAIL}${VARIANT:+)}" - cd "${WORKDIR}/gt4py" && ./noxfile.py -s "test_${SUBPACKAGE}-${PY_VERSION}${NOX_SESSION_ARGS}" From af8bcebddea7df8cc15edd618db6d2b6924e6648 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Tue, 3 Feb 2026 09:38:42 +0100 Subject: [PATCH 3/6] Refactor CI configuration for git clone commands --- ci/cscs-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 454e1be38d..6837cac4e3 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -88,6 +88,7 @@ build_cscs_amd_rocm: variables: TEST_VARIANTS: 'cpu' # Extended jobs should redefine which variants (cpu, cuda12, rocm6_0) to test CSCS_CUDA_MPS: 1 + USE_MPI: 0 # TODO(havogt): to workaround the libfabric hook injecting incompatible libraries SLURM_JOB_NUM_NODES: 1 SLURM_TIMELIMIT: 5 parallel: @@ -121,8 +122,8 @@ build_cscs_amd_rocm: # The output folder is inside the docker image (${WORKDIR}/gt4py), so it won't take up # space on $SCRATCH but in a ephemeral tmpfs mount in the running node. # for git>=2.49: mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 --revision "${CI_COMMIT_SHA}" "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py" - - LD_LIBRARY_PATH=/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py" - - LD_LIBRARY_PATH=/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH cd "${WORKDIR}/gt4py" && git fetch --depth 1 origin "${CI_COMMIT_SHA}" && git checkout "${CI_COMMIT_SHA}" + - mkdir -p "${WORKDIR}/gt4py" && git clone --depth 1 "${CSCS_CI_ORIG_CLONE_URL}" "${WORKDIR}/gt4py" + - cd "${WORKDIR}/gt4py" && git fetch --depth 1 origin "${CI_COMMIT_SHA}" && git checkout "${CI_COMMIT_SHA}" - export NOX_SESSION_ARGS="${VARIANT:+($VARIANT}${SUBVARIANT:+, $SUBVARIANT}${DETAIL:+, $DETAIL}${VARIANT:+)}" - cd "${WORKDIR}/gt4py" && ./noxfile.py -s "test_${SUBPACKAGE}-${PY_VERSION}${NOX_SESSION_ARGS}" From 262ffa26a82ea3e403bed99ef04c49bc8fef0c70 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Thu, 5 Feb 2026 11:55:53 +0100 Subject: [PATCH 4/6] Apply suggestion from @havogt --- ci/cscs-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 6837cac4e3..05c8a24caf 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -65,7 +65,6 @@ stages: BASE_IMAGE: jfrog.svc.cscs.ch/dockerhub/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete EXTRA_UV_SYNC_ARGS: "--extra rocm6_0" EXTRA_UV_ENV_VARS: "CUPY_INSTALL_USE_HIP=1 HCC_AMDGPU_TARGET=gfx942 ROCM_HOME=/opt/rocm" - UBUNTU_VERSION: '24.04' build_cscs_gh200: extends: From 4f9bd3cffc6f92b213e459e77bdbb1d86d670e37 Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Tue, 10 Feb 2026 10:36:03 +0100 Subject: [PATCH 5/6] Update cscs-ci.yml --- ci/cscs-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 05c8a24caf..5dd7fdfdc7 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -86,7 +86,6 @@ build_cscs_amd_rocm: image: ${CSCS_REGISTRY_PATH}/public/${ARCH}/base/gt4py-ci-${PY_VERSION}:${DOCKER_TAG} variables: TEST_VARIANTS: 'cpu' # Extended jobs should redefine which variants (cpu, cuda12, rocm6_0) to test - CSCS_CUDA_MPS: 1 USE_MPI: 0 # TODO(havogt): to workaround the libfabric hook injecting incompatible libraries SLURM_JOB_NUM_NODES: 1 SLURM_TIMELIMIT: 5 @@ -137,6 +136,7 @@ test_cscs_gh200: SLURM_GPUS_PER_NODE: 1 SLURM_PARTITION: 'shared' GT4PY_BUILD_JOBS: 8 + CSCS_CUDA_MPS: 1 # Limit test parallelism to avoid "OSError: too many open files" in the gt4py build stage. PYTEST_XDIST_AUTO_NUM_WORKERS: 32 rules: From d664d14b02dda67f3d936a82eb7cca1930d9441b Mon Sep 17 00:00:00 2001 From: Hannes Vogt Date: Wed, 11 Feb 2026 08:41:24 +0100 Subject: [PATCH 6/6] Disable MPS for santis pipeline --- ci/cscs-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/cscs-ci.yml b/ci/cscs-ci.yml index 5dd7fdfdc7..1bc98b5d45 100644 --- a/ci/cscs-ci.yml +++ b/ci/cscs-ci.yml @@ -136,7 +136,6 @@ test_cscs_gh200: SLURM_GPUS_PER_NODE: 1 SLURM_PARTITION: 'shared' GT4PY_BUILD_JOBS: 8 - CSCS_CUDA_MPS: 1 # Limit test parallelism to avoid "OSError: too many open files" in the gt4py build stage. PYTEST_XDIST_AUTO_NUM_WORKERS: 32 rules: