From 1fd638925fdc6caf240998d55546a45493ed8027 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 28 Jan 2026 14:36:01 +0100 Subject: [PATCH 01/29] Attempt to add cuda support to distributed ci pipeline --- ci/distributed.yml | 3 ++- ci/docker/base_mpi.Dockerfile | 36 ++++++++++++++++--------------- ci/docker/checkout_mpi.Dockerfile | 2 +- scripts/ci-mpi-wrapper.sh | 2 ++ 4 files changed, 24 insertions(+), 19 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 8b173b22b0..d8f8b9e920 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -80,7 +80,8 @@ build_distributed_cpu: parallel: matrix: - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common] - BACKEND: [embedded, gtfn_cpu, dace_cpu] + # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu] + BACKEND: [dace_cpu, dace_gpu] rules: - if: $COMPONENT == 'atmosphere/diffusion' variables: diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index 3fcdb21297..914b556136 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -4,23 +4,25 @@ ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 ARG DEBIAN_FRONTEND=noninteractive -RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \ - strace \ - build-essential \ - tar \ - wget \ - curl \ - libboost-dev \ - libnuma-dev \ - libopenmpi-dev \ - ca-certificates \ - libssl-dev \ - autoconf \ - automake \ - libtool \ - pkg-config \ - libreadline-dev \ - git && \ +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + build-essential \ + ca-certificates \ + curl \ + git \ + libboost-dev \ + libnuma-dev \ + libopenmpi-dev \ + libreadline-dev \ + libssl-dev \ + libtool \ + nvidia-cuda-dev \ + pkg-config \ + strace \ + tar \ + wget && \ rm -rf /var/lib/apt/lists/* # Install uv: https://docs.astral.sh/uv/guides/integration/docker diff --git a/ci/docker/checkout_mpi.Dockerfile b/ci/docker/checkout_mpi.Dockerfile index c229d6c374..62ea5daeae 100644 --- 
a/ci/docker/checkout_mpi.Dockerfile +++ b/ci/docker/checkout_mpi.Dockerfile @@ -8,4 +8,4 @@ ARG PYVERSION ARG VENV ENV UV_PROJECT_ENVIRONMENT=$VENV ENV MPI4PY_BUILD_BACKEND="scikit-build-core" -RUN uv sync --extra distributed --python=$PYVERSION +RUN uv sync --extra all --python=$PYVERSION diff --git a/scripts/ci-mpi-wrapper.sh b/scripts/ci-mpi-wrapper.sh index 900dd340ae..c0aa25d41f 100755 --- a/scripts/ci-mpi-wrapper.sh +++ b/scripts/ci-mpi-wrapper.sh @@ -17,6 +17,8 @@ else exit 1 fi +export CUDA_VISIBLE_DEVICES="${rank}" + log_file="${CI_PROJECT_DIR:+${CI_PROJECT_DIR}/}pytest-log-rank-${rank}.txt" if [[ "${rank}" -eq 0 ]]; then From cbb1891e84a85b316a550b81d497021313244190 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 28 Jan 2026 17:03:44 +0100 Subject: [PATCH 02/29] Add cuda12 extra --- ci/docker/checkout_mpi.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/checkout_mpi.Dockerfile b/ci/docker/checkout_mpi.Dockerfile index 62ea5daeae..4cbf1d32c0 100644 --- a/ci/docker/checkout_mpi.Dockerfile +++ b/ci/docker/checkout_mpi.Dockerfile @@ -8,4 +8,4 @@ ARG PYVERSION ARG VENV ENV UV_PROJECT_ENVIRONMENT=$VENV ENV MPI4PY_BUILD_BACKEND="scikit-build-core" -RUN uv sync --extra all --python=$PYVERSION +RUN uv sync --extra all --extra cuda12 --python=$PYVERSION From bbb151cbef4a93e65a6ccb451bc9f24e10dfd36c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 28 Jan 2026 18:02:02 +0100 Subject: [PATCH 03/29] Add nvidia-cuda-toolkit --- ci/docker/base_mpi.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index 914b556136..92cb700e22 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -19,6 +19,7 @@ RUN apt-get update && \ libssl-dev \ libtool \ nvidia-cuda-dev \ + nvidia-cuda-toolkit \ pkg-config \ strace \ tar \ From b9be7fb076c60cc495ce92aa8719b2d2032269a3 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 29 Jan 
2026 13:02:09 +0100 Subject: [PATCH 04/29] Revert "refactor: testing infrastructure (#1002)" This reverts commit e30c2f71e668952698fd93e3ce1a1c054029ea6c. --- .../model/common/utils/device_utils.py | 3 --- .../icon4py/model/testing/data_handling.py | 23 +++-------------- .../model/testing/fixtures/datatest.py | 25 +++++++++++++++++-- .../icon4py/model/testing/stencil_tests.py | 17 ++++++------- 4 files changed, 33 insertions(+), 35 deletions(-) diff --git a/model/common/src/icon4py/model/common/utils/device_utils.py b/model/common/src/icon4py/model/common/utils/device_utils.py index 360a53902a..cacfc8eb64 100644 --- a/model/common/src/icon4py/model/common/utils/device_utils.py +++ b/model/common/src/icon4py/model/common/utils/device_utils.py @@ -37,9 +37,6 @@ def sync(allocator: gtx_typing.FieldBufferAllocationUtil | None = None) -> None: Note: this is and ad-hoc interface, maybe the function should get the device to sync for. """ - # Type annotation already describes that only these types are allowed, but mypy coverage is not great. - # The explicit assert avoids critical mistakes in using this function. - assert allocator is None or gtx_allocators.is_field_allocation_tool(allocator) if allocator is not None and is_cupy_device(allocator): cp.cuda.runtime.deviceSynchronize() diff --git a/model/testing/src/icon4py/model/testing/data_handling.py b/model/testing/src/icon4py/model/testing/data_handling.py index 9624c64839..9ecf932335 100644 --- a/model/testing/src/icon4py/model/testing/data_handling.py +++ b/model/testing/src/icon4py/model/testing/data_handling.py @@ -6,13 +6,11 @@ # Please, refer to the LICENSE file in the root directory. 
# SPDX-License-Identifier: BSD-3-Clause -import pathlib import tarfile +from pathlib import Path -from icon4py.model.testing import config, locking - -def download_and_extract(uri: str, dst: pathlib.Path, data_file: str = "downloaded.tar.gz") -> None: +def download_and_extract(uri: str, dst: Path, data_file: str = "downloaded.tar.gz") -> None: """ Download data archive from remote server. @@ -33,19 +31,4 @@ def download_and_extract(uri: str, dst: pathlib.Path, data_file: str = "download raise OSError(f"{data_file} needs to be a valid tar file") with tarfile.open(data_file, mode="r:*") as tf: tf.extractall(path=dst) - pathlib.Path(data_file).unlink(missing_ok=True) - - -def download_test_data(dst: pathlib.Path, uri: str) -> None: - if config.ENABLE_TESTDATA_DOWNLOAD: - # We create and lock the *parent* directory as we later check for existence of `dst`. - dst.parent.mkdir(parents=True, exist_ok=True) - with locking.lock(dst.parent): - if not dst.exists(): - download_and_extract(uri, dst) - else: - # If test data download is disabled, we check if the directory exists - # without locking. We assume the location is managed by the user - # and avoid locking shared directories (e.g. on CI). 
- if not dst.exists(): - raise RuntimeError(f"Test data {dst} does not exist, and downloading is disabled.") + Path(data_file).unlink(missing_ok=True) diff --git a/model/testing/src/icon4py/model/testing/fixtures/datatest.py b/model/testing/src/icon4py/model/testing/fixtures/datatest.py index c1d17332e9..28483172a1 100644 --- a/model/testing/src/icon4py/model/testing/fixtures/datatest.py +++ b/model/testing/src/icon4py/model/testing/fixtures/datatest.py @@ -17,7 +17,13 @@ from icon4py.model.common import model_backends, model_options from icon4py.model.common.constants import RayleighType from icon4py.model.common.grid import base as base_grid -from icon4py.model.testing import data_handling as data, datatest_utils as dt_utils, definitions +from icon4py.model.testing import ( + config, + data_handling as data, + datatest_utils as dt_utils, + definitions, + locking, +) if TYPE_CHECKING: @@ -119,7 +125,22 @@ def _download_ser_data( try: destination_path = dt_utils.get_datapath_for_experiment(_ranked_data_path, _experiment) uri = _experiment.partitioned_data[comm_size] - data.download_test_data(destination_path, uri) + + data_file = _ranked_data_path.joinpath(f"{_experiment.name}_mpitask{comm_size}.tar.gz").name + _ranked_data_path.mkdir(parents=True, exist_ok=True) + if config.ENABLE_TESTDATA_DOWNLOAD: + with locking.lock(_ranked_data_path): + # Note: if the lock would be created for `destination_path` it would always exist... + if not destination_path.exists(): + data.download_and_extract(uri, _ranked_data_path, data_file) + else: + # If test data download is disabled, we check if the directory exists + # without locking. We assume the location is managed by the user + # and avoid locking shared directories (e.g. on CI). + if not destination_path.exists(): + raise RuntimeError( + f"Serialization data {data_file} does not exist, and downloading is disabled." 
+ ) except KeyError as err: raise RuntimeError( f"No data for communicator of size {comm_size} exists, use 1, 2 or 4" diff --git a/model/testing/src/icon4py/model/testing/stencil_tests.py b/model/testing/src/icon4py/model/testing/stencil_tests.py index ad1bf5e0ac..f83798f029 100644 --- a/model/testing/src/icon4py/model/testing/stencil_tests.py +++ b/model/testing/src/icon4py/model/testing/stencil_tests.py @@ -21,7 +21,6 @@ config as gtx_config, constructors, metrics as gtx_metrics, - named_collections as gtx_named_collections, typing as gtx_typing, ) @@ -35,15 +34,13 @@ def allocate_data( allocator: gtx_typing.FieldBufferAllocationUtil | None, - input_data: dict[ - str, Any - ], # `Field`s or collection of `Field`s are re-allocated, the rest is passed through -) -> dict[str, Any]: - def _allocate_field(f: gtx.Field) -> gtx.Field: - return constructors.as_field(domain=f.domain, data=f.ndarray, allocator=allocator) - + input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]], +) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]: + _allocate_field = constructors.as_field.partial(allocator=allocator) # type:ignore[attr-defined] # TODO(havogt): check why it doesn't understand the fluid_partial input_data = { - k: gtx_named_collections.tree_map_named_collection(_allocate_field)(v) + k: tuple(_allocate_field(domain=field.domain, data=field.ndarray) for field in v) + if isinstance(v, tuple) + else _allocate_field(domain=v.domain, data=v.ndarray) if not gtx.is_scalar_type(v) and k != "domain" else v for k, v in input_data.items() @@ -210,7 +207,7 @@ def _properly_allocated_input_data( self, input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]], backend_like: model_backends.BackendLike, - ) -> dict[str, Any]: + ) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]: # TODO(havogt): this is a workaround, # because in the `input_data` fixture provided by the user # it does not allocate for the correct device. 
From 731283a76200caadf5ca5c19ac68c26c79949ef5 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 29 Jan 2026 14:00:57 +0100 Subject: [PATCH 05/29] Use cxi hook in ci --- ci/distributed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index d8f8b9e920..00953956a1 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -38,7 +38,7 @@ build_distributed_baseimage_aarch64: DOCKERFILE: ci/docker/checkout_mpi.Dockerfile DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]' PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi - USE_MPI: NO + USE_MPI: YES SLURM_MPI_TYPE: pmix PMIX_MCA_psec: native PMIX_MCA_gds: "^shmem2" From ea2b3aa7bbfddde4b32bee52ac857b999fec5884 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 29 Jan 2026 14:24:53 +0100 Subject: [PATCH 06/29] Try mpich --- ci/distributed.yml | 7 ++++--- ci/docker/base_mpi.Dockerfile | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 00953956a1..5f58839466 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -39,9 +39,10 @@ build_distributed_baseimage_aarch64: DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]' PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi USE_MPI: YES - SLURM_MPI_TYPE: pmix - PMIX_MCA_psec: native - PMIX_MCA_gds: "^shmem2" + SLURM_MPI_TYPE: pmi2 + # SLURM_MPI_TYPE: pmix + # PMIX_MCA_psec: native + # PMIX_MCA_gds: "^shmem2" .build_distributed_cpu: extends: [.build_distributed_template] diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index 92cb700e22..d7c6b379c5 100644 --- a/ci/docker/base_mpi.Dockerfile +++ 
b/ci/docker/base_mpi.Dockerfile @@ -13,8 +13,8 @@ RUN apt-get update && \ curl \ git \ libboost-dev \ + libmpich-dev \ libnuma-dev \ - libopenmpi-dev \ libreadline-dev \ libssl-dev \ libtool \ From 8f04d362b80a7f07ade11bf9ebf48f951b5c9f5c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 29 Jan 2026 14:25:13 +0100 Subject: [PATCH 07/29] Reduce tests --- ci/distributed.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 5f58839466..7f75ebc63b 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -80,7 +80,8 @@ build_distributed_cpu: - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT parallel: matrix: - - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common] + # - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common] + - COMPONENT: [common] # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu] BACKEND: [dace_cpu, dace_gpu] rules: From 9f96b70edce78ffefdda8cd00b82ee7a886fcd43 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 29 Jan 2026 18:48:31 +0100 Subject: [PATCH 08/29] Try using manually built openmpi --- ci/distributed.yml | 9 ++--- ci/docker/base_mpi.Dockerfile | 75 ++++++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 7f75ebc63b..4d4d518b58 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -38,11 +38,10 @@ build_distributed_baseimage_aarch64: DOCKERFILE: ci/docker/checkout_mpi.Dockerfile DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]' PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi - USE_MPI: YES - SLURM_MPI_TYPE: pmi2 - # SLURM_MPI_TYPE: pmix - # PMIX_MCA_psec: native - # PMIX_MCA_gds: "^shmem2" + USE_MPI: NO + SLURM_MPI_TYPE: pmix + PMIX_MCA_psec: native + 
PMIX_MCA_gds: "^shmem2" .build_distributed_cpu: extends: [.build_distributed_template] diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index d7c6b379c5..bc18fd95fe 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -13,18 +13,91 @@ RUN apt-get update && \ curl \ git \ libboost-dev \ - libmpich-dev \ + libconfig-dev \ + libcurl4-openssl-dev \ + libfuse-dev \ + libjson-c-dev \ + libnl-3-dev \ libnuma-dev \ libreadline-dev \ + libsensors-dev \ libssl-dev \ libtool \ + libuv1-dev \ + libyaml-dev \ nvidia-cuda-dev \ nvidia-cuda-toolkit \ pkg-config \ + python3 \ strace \ tar \ wget && \ rm -rf /var/lib/apt/lists/* +# Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use on Alps. +ARG gdrcopy_version=2.5.1 +RUN set -eux; \ + git clone --depth 1 --branch "v${gdrcopy_version}" https://github.com/NVIDIA/gdrcopy.git; \ + cd gdrcopy; \ + make lib -j"$(nproc)" lib_install; \ + cd /; \ + rm -rf /gdrcopy; \ + ldconfig + +ARG cassini_headers_version=release/shs-13.0.0 +RUN set -eux; \ + git clone --depth 1 --branch "${cassini_headers_version}" https://github.com/HewlettPackard/shs-cassini-headers.git; \ + cd shs-cassini-headers; \ + cp -r include/* /usr/include/; \ + cp -r share/* /usr/share/; \ + rm -rf /shs-cassini-headers + +ARG cxi_driver_version=release/shs-13.0.0 +RUN set -eux; \ + git clone --depth 1 --branch "${cxi_driver_version}" https://github.com/HewlettPackard/shs-cxi-driver.git; \ + cd shs-cxi-driver; \ + cp -r include/* /usr/include/; \ + rm -rf /shs-cxi-driver + +ARG libcxi_version=release/shs-13.0.0 +RUN set -eux; \ + git clone --depth 1 --branch "${libcxi_version}" https://github.com/HewlettPackard/shs-libcxi.git; \ + cd shs-libcxi; \ + ./autogen.sh; \ + ./configure \ + --with-cuda; \ + make -j"$(nproc)" install; \ + cd /; \ + rm -rf /shs-libcxi; \ + ldconfig + +ARG libfabric_version=v2.4.0 +RUN set -eux; \ + git clone --depth 1 --branch "${libfabric_version}" 
https://github.com/ofiwg/libfabric.git; \ + cd libfabric; \ + ./autogen.sh; \ + ./configure \ + --with-cuda \ + --enable-cuda-dlopen \ + --enable-gdrcopy-dlopen \ + --enable-cxi; \ + make -j"$(nproc)" install; \ + cd /; \ + rm -rf /libfabric; \ + ldconfig + +ARG openmpi_version=5.0.9 +RUN set -eux; \ + curl -fsSL "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${openmpi_version}.tar.gz" -o /tmp/ompi.tar.gz; \ + tar -C /tmp -xzf /tmp/ompi.tar.gz; \ + cd "/tmp/openmpi-${openmpi_version}"; \ + ./configure \ + --with-ofi \ + --with-cuda=/usr; \ + make -j"$(nproc)" install; \ + cd /; \ + rm -rf "/tmp/openmpi-${openmpi_version}" /tmp/ompi.tar.gz; \ + ldconfig + # Install uv: https://docs.astral.sh/uv/guides/integration/docker COPY --from=ghcr.io/astral-sh/uv:0.9.24@sha256:816fdce3387ed2142e37d2e56e1b1b97ccc1ea87731ba199dc8a25c04e4997c5 /uv /uvx /bin/ From 9fce9b55efbda6dbe3ea10996bb54b96f8569d81 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 30 Jan 2026 13:01:12 +0100 Subject: [PATCH 09/29] Debugging --- ci/distributed.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/distributed.yml b/ci/distributed.yml index 4d4d518b58..9d545192cc 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -76,6 +76,8 @@ build_distributed_cpu: - source ${UV_PROJECT_ENVIRONMENT}/bin/activate - echo "running with $(python --version)" script: + - printenv + - echo USE_MPI=\${USE_MPI} - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT parallel: matrix: From c6a767ed9a1a1ec893fc3943f5f4a2e686f6bee9 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 30 Jan 2026 15:22:32 +0100 Subject: [PATCH 10/29] Remove debug prints --- ci/distributed.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 9d545192cc..4d4d518b58 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -76,8 +76,6 @@ build_distributed_cpu: - source ${UV_PROJECT_ENVIRONMENT}/bin/activate - 
echo "running with $(python --version)" script: - - printenv - - echo USE_MPI=\${USE_MPI} - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT parallel: matrix: From adb1ee6fda08bb5cda894c49634eb1a63656679a Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 30 Jan 2026 15:28:18 +0100 Subject: [PATCH 11/29] Unrevert test download changes --- .../icon4py/model/common/utils/device_utils.py | 3 +++ .../src/icon4py/model/testing/data_handling.py | 5 +++-- .../icon4py/model/testing/fixtures/datatest.py | 8 +------- .../src/icon4py/model/testing/stencil_tests.py | 17 ++++++++++------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/model/common/src/icon4py/model/common/utils/device_utils.py b/model/common/src/icon4py/model/common/utils/device_utils.py index cacfc8eb64..360a53902a 100644 --- a/model/common/src/icon4py/model/common/utils/device_utils.py +++ b/model/common/src/icon4py/model/common/utils/device_utils.py @@ -37,6 +37,9 @@ def sync(allocator: gtx_typing.FieldBufferAllocationUtil | None = None) -> None: Note: this is and ad-hoc interface, maybe the function should get the device to sync for. """ + # Type annotation already describes that only these types are allowed, but mypy coverage is not great. + # The explicit assert avoids critical mistakes in using this function. 
+ assert allocator is None or gtx_allocators.is_field_allocation_tool(allocator) if allocator is not None and is_cupy_device(allocator): cp.cuda.runtime.deviceSynchronize() diff --git a/model/testing/src/icon4py/model/testing/data_handling.py b/model/testing/src/icon4py/model/testing/data_handling.py index c490c8981b..95bc8b8369 100644 --- a/model/testing/src/icon4py/model/testing/data_handling.py +++ b/model/testing/src/icon4py/model/testing/data_handling.py @@ -9,10 +9,11 @@ import os import pathlib import tarfile -from pathlib import Path +from icon4py.model.testing import config, locking -def download_and_extract(uri: str, dst: Path, data_file: str = "downloaded.tar.gz") -> None: + +def download_and_extract(uri: str, dst: pathlib.Path, data_file: str = "downloaded.tar.gz") -> None: """ Download data archive from remote server. diff --git a/model/testing/src/icon4py/model/testing/fixtures/datatest.py b/model/testing/src/icon4py/model/testing/fixtures/datatest.py index 057235b1eb..0727c962ed 100644 --- a/model/testing/src/icon4py/model/testing/fixtures/datatest.py +++ b/model/testing/src/icon4py/model/testing/fixtures/datatest.py @@ -17,13 +17,7 @@ from icon4py.model.common import model_backends, model_options from icon4py.model.common.constants import RayleighType from icon4py.model.common.grid import base as base_grid -from icon4py.model.testing import ( - config, - data_handling as data, - datatest_utils as dt_utils, - definitions, - locking, -) +from icon4py.model.testing import data_handling as data, datatest_utils as dt_utils, definitions if TYPE_CHECKING: diff --git a/model/testing/src/icon4py/model/testing/stencil_tests.py b/model/testing/src/icon4py/model/testing/stencil_tests.py index f83798f029..ad1bf5e0ac 100644 --- a/model/testing/src/icon4py/model/testing/stencil_tests.py +++ b/model/testing/src/icon4py/model/testing/stencil_tests.py @@ -21,6 +21,7 @@ config as gtx_config, constructors, metrics as gtx_metrics, + named_collections as 
gtx_named_collections, typing as gtx_typing, ) @@ -34,13 +35,15 @@ def allocate_data( allocator: gtx_typing.FieldBufferAllocationUtil | None, - input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]], -) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]: - _allocate_field = constructors.as_field.partial(allocator=allocator) # type:ignore[attr-defined] # TODO(havogt): check why it doesn't understand the fluid_partial + input_data: dict[ + str, Any + ], # `Field`s or collection of `Field`s are re-allocated, the rest is passed through +) -> dict[str, Any]: + def _allocate_field(f: gtx.Field) -> gtx.Field: + return constructors.as_field(domain=f.domain, data=f.ndarray, allocator=allocator) + input_data = { - k: tuple(_allocate_field(domain=field.domain, data=field.ndarray) for field in v) - if isinstance(v, tuple) - else _allocate_field(domain=v.domain, data=v.ndarray) + k: gtx_named_collections.tree_map_named_collection(_allocate_field)(v) if not gtx.is_scalar_type(v) and k != "domain" else v for k, v in input_data.items() @@ -207,7 +210,7 @@ def _properly_allocated_input_data( self, input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]], backend_like: model_backends.BackendLike, - ) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]: + ) -> dict[str, Any]: # TODO(havogt): this is a workaround, # because in the `input_data` fixture provided by the user # it does not allocate for the correct device. 
From b0321e77e07460784e93008beaeff3bc4fcffc64 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 30 Jan 2026 15:43:10 +0100 Subject: [PATCH 12/29] Numpy/cupy issues Make revert_repeated_index_to_invalid numpy-only as it's not usefully vectorized --- model/common/src/icon4py/model/common/grid/utils.py | 10 +++++----- model/testing/src/icon4py/model/testing/serialbox.py | 7 ++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/model/common/src/icon4py/model/common/grid/utils.py b/model/common/src/icon4py/model/common/grid/utils.py index 39b48c9dd5..4af7b0a6ba 100644 --- a/model/common/src/icon4py/model/common/grid/utils.py +++ b/model/common/src/icon4py/model/common/grid/utils.py @@ -12,14 +12,14 @@ from icon4py.model.common.grid import gridfile -def revert_repeated_index_to_invalid(offset: np.ndarray, array_ns: ModuleType): +def revert_repeated_index_to_invalid(offset: np.ndarray): num_elements = offset.shape[0] for i in range(num_elements): # convert repeated indices back into -1 - for val in array_ns.flip(offset[i, :]): - if array_ns.count_nonzero(val == offset[i, :]) > 1: - unique_values, counts = array_ns.unique(offset[i, :], return_counts=True) + for val in np.flip(offset[i, :]): + if np.count_nonzero(val == offset[i, :]) > 1: + unique_values, counts = np.unique(offset[i, :], return_counts=True) rep_values = unique_values[counts > 1] - rep_indices = array_ns.where(array_ns.isin(offset[i, :], rep_values))[0] + rep_indices = np.where(np.isin(offset[i, :], rep_values))[0] offset[i, rep_indices[1:]] = gridfile.GridFile.INVALID_INDEX return offset diff --git a/model/testing/src/icon4py/model/testing/serialbox.py b/model/testing/src/icon4py/model/testing/serialbox.py index be4edf41dd..05a3fc53fe 100644 --- a/model/testing/src/icon4py/model/testing/serialbox.py +++ b/model/testing/src/icon4py/model/testing/serialbox.py @@ -72,7 +72,7 @@ def wrapper(self, *args, **kwargs): # as a workaround for the lack of support for optional fields in gt4py. 
shp = (1,) * len(dims) return gtx.as_field( - dims, np.zeros(shp, dtype=dtype), allocator=self.backend + dims, self.xp.zeros(shp, dtype=dtype), allocator=self.backend ) else: return None @@ -503,10 +503,7 @@ def construct_icon_grid( def potentially_revert_icon_index_transformation(ar): return ar else: - potentially_revert_icon_index_transformation = functools.partial( - grid_utils.revert_repeated_index_to_invalid, - array_ns=data_alloc.import_array_ns(backend), - ) + potentially_revert_icon_index_transformation = grid_utils.revert_repeated_index_to_invalid c2e2c = self.c2e2c() e2c2e = potentially_revert_icon_index_transformation(self.e2c2e()) From c62979c718f488c719a59bb997ea56584adbd684 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 30 Jan 2026 16:32:43 +0100 Subject: [PATCH 13/29] Enable shm, lnx, xpmem support in libfabric --- ci/docker/base_mpi.Dockerfile | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index bc18fd95fe..6d00d12db9 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -71,6 +71,19 @@ RUN set -eux; \ rm -rf /shs-libcxi; \ ldconfig +ARG xpmem_version=0d0bad4e1d07b38d53ecc8f20786bb1328c446da +RUN set -eux; \ + git clone https://github.com/hpc/xpmem.git; \ + cd xpmem; \ + git checkout "${xpmem_version}"; \ + ./autogen.sh; \ + ./configure --disable-kernel-module; \ + make -j"$(nproc)" install; \ + cd /; \ + rm -rf /xpmem; \ + ldconfig + +# NOTE: xpmem is not found correctly without setting the prefix in --enable-xpmem ARG libfabric_version=v2.4.0 RUN set -eux; \ git clone --depth 1 --branch "${libfabric_version}" https://github.com/ofiwg/libfabric.git; \ @@ -80,7 +93,11 @@ RUN set -eux; \ --with-cuda \ --enable-cuda-dlopen \ --enable-gdrcopy-dlopen \ - --enable-cxi; \ + --enable-xpmem=/usr \ + --enable-tcp \ + --enable-cxi \ + --enable-lnx \ + --enable-shm; \ make -j"$(nproc)" install; \ cd /; \ rm -rf /libfabric; 
\ From b4071d03f696503dbe4e158c7308372bca3a3362 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 30 Jan 2026 16:45:50 +0100 Subject: [PATCH 14/29] Linting --- model/common/src/icon4py/model/common/grid/utils.py | 1 - model/testing/src/icon4py/model/testing/serialbox.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/model/common/src/icon4py/model/common/grid/utils.py b/model/common/src/icon4py/model/common/grid/utils.py index 4af7b0a6ba..dbb3d69449 100644 --- a/model/common/src/icon4py/model/common/grid/utils.py +++ b/model/common/src/icon4py/model/common/grid/utils.py @@ -5,7 +5,6 @@ # # Please, refer to the LICENSE file in the root directory. # SPDX-License-Identifier: BSD-3-Clause -from types import ModuleType import numpy as np diff --git a/model/testing/src/icon4py/model/testing/serialbox.py b/model/testing/src/icon4py/model/testing/serialbox.py index 05a3fc53fe..3bb52a9ed1 100644 --- a/model/testing/src/icon4py/model/testing/serialbox.py +++ b/model/testing/src/icon4py/model/testing/serialbox.py @@ -503,7 +503,9 @@ def construct_icon_grid( def potentially_revert_icon_index_transformation(ar): return ar else: - potentially_revert_icon_index_transformation = grid_utils.revert_repeated_index_to_invalid + potentially_revert_icon_index_transformation = ( + grid_utils.revert_repeated_index_to_invalid + ) c2e2c = self.c2e2c() e2c2e = potentially_revert_icon_index_transformation(self.e2c2e()) From 6eb3d8d4379b10a9efedeadc84888b54c8e48852 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 30 Jan 2026 19:47:59 +0100 Subject: [PATCH 15/29] Enable GPU support for GHEX --- ci/docker/checkout_mpi.Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/docker/checkout_mpi.Dockerfile b/ci/docker/checkout_mpi.Dockerfile index 4cbf1d32c0..01e26702b4 100644 --- a/ci/docker/checkout_mpi.Dockerfile +++ b/ci/docker/checkout_mpi.Dockerfile @@ -7,5 +7,9 @@ WORKDIR /icon4py ARG PYVERSION ARG VENV ENV 
UV_PROJECT_ENVIRONMENT=$VENV -ENV MPI4PY_BUILD_BACKEND="scikit-build-core" +ENV MPI4PY_BUILD_BACKEND=scikit-build-core +ENV GHEX_USE_GPU=ON +ENV GHEX_GPU_TYPE=NVIDIA +ENV GHEX_GPU_ARCH=90 +ENV GHEX_TRANSPORT_BACKEND=MPI RUN uv sync --extra all --extra cuda12 --python=$PYVERSION From 28b1b1bbdae5a5623f941b238df8106c1165cab6 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Sun, 1 Feb 2026 21:13:08 +0100 Subject: [PATCH 16/29] Set appropriate gcc for cuda --- ci/docker/base_mpi.Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index 6d00d12db9..eb46a926a9 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -27,6 +27,7 @@ RUN apt-get update && \ libyaml-dev \ nvidia-cuda-dev \ nvidia-cuda-toolkit \ + nvidia-cuda-toolkit-gcc \ pkg-config \ python3 \ strace \ @@ -34,6 +35,10 @@ RUN apt-get update && \ wget && \ rm -rf /var/lib/apt/lists/* +ENV CC=/usr/bin/cuda-gcc +ENV CXX=/usr/bin/cuda-g++ +ENV CUDAHOSTCXX=/usr/bin/cuda-g++ + # Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use on Alps. 
ARG gdrcopy_version=2.5.1 RUN set -eux; \ From 73a5b5bb1bf26a2d056b66b8db73eaa1a0538441 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Sun, 1 Feb 2026 21:13:20 +0100 Subject: [PATCH 17/29] Explicitly set OpenMPI settings --- ci/distributed.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 4d4d518b58..3c978a5f69 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -38,10 +38,6 @@ build_distributed_baseimage_aarch64: DOCKERFILE: ci/docker/checkout_mpi.Dockerfile DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]' PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi - USE_MPI: NO - SLURM_MPI_TYPE: pmix - PMIX_MCA_psec: native - PMIX_MCA_gds: "^shmem2" .build_distributed_cpu: extends: [.build_distributed_template] @@ -66,6 +62,16 @@ build_distributed_cpu: ICON4PY_ENABLE_GRID_DOWNLOAD: false ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false CSCS_ADDITIONAL_MOUNTS: '["/capstor/store/cscs/userlab/d126/icon4py/ci/testdata_003:$TEST_DATA_PATH"]' + # Do not use libfabric from the host system. Libfabric with slingshot + # support is built into the container image. + USE_MPI: NO + # Use libfabric slingshot (cxi) provider and recommended settings from + # https://docs.cscs.ch/software/communication/openmpi. 
+ SLURM_MPI_TYPE: pmix + PMIX_MCA_psec: native + FI_PROVIDER: cxi + OMPI_MCA_pml: cm + OMPI_MCA_mtl: ofi .test_distributed_aarch64: stage: test From d8e90e4fe01750202ea403fc88d3d44bdb282513 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 3 Feb 2026 11:52:51 +0100 Subject: [PATCH 18/29] Don't dlopen cuda and gdrcopy --- ci/docker/base_mpi.Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index eb46a926a9..383ffe04c9 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -96,8 +96,6 @@ RUN set -eux; \ ./autogen.sh; \ ./configure \ --with-cuda \ - --enable-cuda-dlopen \ - --enable-gdrcopy-dlopen \ --enable-xpmem=/usr \ --enable-tcp \ --enable-cxi \ From 67cfdb51d077ddbeb209e568496f6349eda4eceb Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 3 Feb 2026 13:29:24 +0100 Subject: [PATCH 19/29] Update comments and clean up options --- ci/docker/base_mpi.Dockerfile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index 383ffe04c9..f849c4d626 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -39,7 +39,9 @@ ENV CC=/usr/bin/cuda-gcc ENV CXX=/usr/bin/cuda-g++ ENV CUDAHOSTCXX=/usr/bin/cuda-g++ -# Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use on Alps. +# Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use +# on Alps. This is based on examples in +# https://github.com/eth-cscs/cray-network-stack. 
ARG gdrcopy_version=2.5.1 RUN set -eux; \ git clone --depth 1 --branch "v${gdrcopy_version}" https://github.com/NVIDIA/gdrcopy.git; \ @@ -88,7 +90,8 @@ RUN set -eux; \ rm -rf /xpmem; \ ldconfig -# NOTE: xpmem is not found correctly without setting the prefix in --enable-xpmem +# NOTE: xpmem is not found correctly without setting the prefix explicitly in +# --enable-xpmem ARG libfabric_version=v2.4.0 RUN set -eux; \ git clone --depth 1 --branch "${libfabric_version}" https://github.com/ofiwg/libfabric.git; \ @@ -98,9 +101,7 @@ RUN set -eux; \ --with-cuda \ --enable-xpmem=/usr \ --enable-tcp \ - --enable-cxi \ - --enable-lnx \ - --enable-shm; \ + --enable-cxi; \ make -j"$(nproc)" install; \ cd /; \ rm -rf /libfabric; \ From c81af9ebdb020011204538f8b1008d66f9e8d4f4 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 3 Feb 2026 13:29:38 +0100 Subject: [PATCH 20/29] Try ubuntu lts release for distributed ci --- ci/docker/base_mpi.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index f849c4d626..c48241855e 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:25.04 +FROM ubuntu:24.04 ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 From 790612a0ee7b1e5bd390169e7b15a3c50913d39b Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 3 Feb 2026 13:31:51 +0100 Subject: [PATCH 21/29] Set gpu binding through SLURM_GPUS_PER_TASK --- ci/distributed.yml | 1 + scripts/ci-mpi-wrapper.sh | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 3c978a5f69..c0d835e2fe 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -58,6 +58,7 @@ build_distributed_cpu: SLURM_JOB_NUM_NODES: 1 SLURM_CPU_BIND: 'verbose' SLURM_NTASKS: 4 + SLURM_GPUS_PER_TASK: 1 TEST_DATA_PATH: "/icon4py/testdata" ICON4PY_ENABLE_GRID_DOWNLOAD: false ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false diff --git 
a/scripts/ci-mpi-wrapper.sh b/scripts/ci-mpi-wrapper.sh index c0aa25d41f..900dd340ae 100755 --- a/scripts/ci-mpi-wrapper.sh +++ b/scripts/ci-mpi-wrapper.sh @@ -17,8 +17,6 @@ else exit 1 fi -export CUDA_VISIBLE_DEVICES="${rank}" - log_file="${CI_PROJECT_DIR:+${CI_PROJECT_DIR}/}pytest-log-rank-${rank}.txt" if [[ "${rank}" -eq 0 ]]; then From 64482e8fa1eefb5200ac3fbe78406d00e07093c9 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 3 Feb 2026 13:32:32 +0100 Subject: [PATCH 22/29] Enable all tests again --- ci/distributed.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index c0d835e2fe..d8e2a1068c 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -86,10 +86,8 @@ build_distributed_cpu: - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT parallel: matrix: - # - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common] - - COMPONENT: [common] - # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu] - BACKEND: [dace_cpu, dace_gpu] + - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common] + BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu] rules: - if: $COMPONENT == 'atmosphere/diffusion' variables: From b3eef3a6c78072d59df6009425a39fa7a6eaf24d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 3 Feb 2026 13:33:25 +0100 Subject: [PATCH 23/29] Clean up names in distributed.yml --- ci/distributed.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index d8e2a1068c..f8600e85b1 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -39,21 +39,21 @@ build_distributed_baseimage_aarch64: DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]' PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi 
-.build_distributed_cpu: +.build_distributed: extends: [.build_distributed_template] variables: UV_PROJECT_ENVIRONMENT: venv_dist -build_distributed_cpu: +build_distributed: stage: image - extends: [.container-builder-cscs-gh200, .build_distributed_cpu] + extends: [.container-builder-cscs-gh200, .build_distributed] needs: [build_distributed_baseimage_aarch64] .test_template_distributed: timeout: 8h image: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi - extends: [.container-runner-santis-gh200, .build_distributed_cpu] - needs: [build_distributed_cpu] + extends: [.container-runner-santis-gh200, .build_distributed] + needs: [build_distributed] variables: SLURM_JOB_NUM_NODES: 1 SLURM_CPU_BIND: 'verbose' From d6f71d60fb49e6d92fe1a185aaf6a061a654bcc1 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 3 Feb 2026 16:02:30 +0100 Subject: [PATCH 24/29] Update base image to ubuntu 25.10 --- ci/docker/base_mpi.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile index c48241855e..a600b4ff1c 100644 --- a/ci/docker/base_mpi.Dockerfile +++ b/ci/docker/base_mpi.Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:24.04 +FROM ubuntu:25.10 ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 From 518bbdee8c8d92e884267b4fd5a157eaaac29b2e Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 4 Feb 2026 14:19:58 +0100 Subject: [PATCH 25/29] Mark distributed compute_geofac_div test embedded only, like single-rank test --- .../common/decomposition/mpi_tests/test_mpi_decomposition.py | 1 + 1 file changed, 1 insertion(+) diff --git a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py index 5bf956428d..d8f6f2aa88 100644 --- a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py +++ 
b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py @@ -280,6 +280,7 @@ def test_exchange_on_dummy_data( @pytest.mark.mpi @pytest.mark.datatest +@pytest.mark.embedded_only @pytest.mark.parametrize("processor_props", [False], indirect=True) def test_halo_exchange_for_sparse_field( interpolation_savepoint: serialbox.InterpolationSavepoint, From c1eed7f8cc6a57fcc7c96ce55c511fa1f4ed08eb Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 4 Feb 2026 15:19:00 +0100 Subject: [PATCH 26/29] Use Philip's async-mpi branch (fixes GPU buffer stride computation) --- pyproject.toml | 2 +- uv.lock | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e349356eb7..df2c6e3d98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -361,7 +361,7 @@ url = 'https://gridtools.github.io/pypi/' [tool.uv.sources] dace = {index = "gridtools"} -ghex = {git = "https://github.com/msimberg/GHEX.git", branch = "async-mpi"} +ghex = {git = "https://github.com/philip-paul-mueller/GHEX.git", branch = "phimuell__async-mpi-2"} # gt4py = {git = "https://github.com/GridTools/gt4py", branch = "main"} # gt4py = {index = "test.pypi"} icon4py-atmosphere-advection = {workspace = true} diff --git a/uv.lock b/uv.lock index f5641ba1e4..aca8ec23cc 100644 --- a/uv.lock +++ b/uv.lock @@ -1362,7 +1362,7 @@ wheels = [ [[package]] name = "ghex" version = "0.4.1" -source = { git = "https://github.com/msimberg/GHEX.git?branch=async-mpi#6d896166994cedbcfc50da1873239a5edb212e3f" } +source = { git = "https://github.com/philip-paul-mueller/GHEX.git?branch=phimuell__async-mpi-2#80c0650fdae40bdd40e0435e5687267bada4cdd2" } dependencies = [ { name = "mpi4py" }, { name = "numpy" }, @@ -1887,7 +1887,7 @@ requires-dist = [ { name = "cupy-cuda12x", marker = "extra == 'cuda12'", specifier = ">=13.0" }, { name = "dace", specifier = "==43!2026.1.21", index = "https://gridtools.github.io/pypi/" }, { name = "datashader", marker = "extra == 'io'", specifier 
= ">=0.16.1" }, - { name = "ghex", marker = "extra == 'distributed'", git = "https://github.com/msimberg/GHEX.git?branch=async-mpi" }, + { name = "ghex", marker = "extra == 'distributed'", git = "https://github.com/philip-paul-mueller/GHEX.git?branch=phimuell__async-mpi-2" }, { name = "gt4py", specifier = "==1.1.3" }, { name = "gt4py", extras = ["cuda11"], marker = "extra == 'cuda11'" }, { name = "gt4py", extras = ["cuda12"], marker = "extra == 'cuda12'" }, From d08b60cf14d69dd4c3ec16e546621e64ea0d1ba9 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 4 Feb 2026 16:44:34 +0100 Subject: [PATCH 27/29] Increase time limit for distributed dace tests --- ci/distributed.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 3a262d7de0..1828a3f4ea 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -92,9 +92,9 @@ build_distributed: - if: $COMPONENT == 'atmosphere/diffusion' variables: SLURM_TIMELIMIT: '00:05:00' - - if: $COMPONENT == 'atmosphere/dycore' && $BACKEND == 'dace_cpu' + - if: $COMPONENT == 'atmosphere/dycore' && ($BACKEND == 'dace_cpu' || $BACKEND == 'dace_gpu') variables: - SLURM_TIMELIMIT: '00:20:00' + SLURM_TIMELIMIT: '00:30:00' - if: $COMPONENT == 'atmosphere/dycore' variables: SLURM_TIMELIMIT: '00:15:00' From 148850c271ccc19c6c8b333b0190c247e19cb2bd Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 4 Feb 2026 16:56:17 +0100 Subject: [PATCH 28/29] Increase time limit for distributed dace_gpu common tests --- ci/distributed.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/distributed.yml b/ci/distributed.yml index 1828a3f4ea..8c22a08611 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -95,6 +95,9 @@ build_distributed: - if: $COMPONENT == 'atmosphere/dycore' && ($BACKEND == 'dace_cpu' || $BACKEND == 'dace_gpu') variables: SLURM_TIMELIMIT: '00:30:00' + - if: $COMPONENT == 'common' && $BACKEND == 'dace_gpu' + variables: + SLURM_TIMELIMIT: '00:45:00' - 
if: $COMPONENT == 'atmosphere/dycore' variables: SLURM_TIMELIMIT: '00:15:00' From 0c727f58ff443cf7d049a36bb5d383d0603ec00e Mon Sep 17 00:00:00 2001 From: Jacopo Canton Date: Thu, 5 Feb 2026 12:52:25 +0100 Subject: [PATCH 29/29] sorry2 --- ci/distributed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/distributed.yml b/ci/distributed.yml index 192838a0f5..4b4038d047 100644 --- a/ci/distributed.yml +++ b/ci/distributed.yml @@ -59,7 +59,7 @@ build_distributed: SLURM_CPU_BIND: 'verbose' SLURM_NTASKS: 4 SLURM_GPUS_PER_TASK: 1 - TEST_DATA_PATH: "/icon4py/testdata" + ICON4PY_TEST_DATA_PATH: "/icon4py/testdata" ICON4PY_ENABLE_GRID_DOWNLOAD: false ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false CSCS_ADDITIONAL_MOUNTS: '["/capstor/store/cscs/userlab/cwci02/icon4py/ci/testdata:$ICON4PY_TEST_DATA_PATH"]'