From 6f1c95b10f71925a50d6e1cdb18bf9ae8c7accf1 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Wed, 28 Jan 2026 08:50:02 +0100 Subject: [PATCH 1/8] Updated the CPU memory order. --- .../dace/transformations/auto_optimize.py | 19 +++++----- .../runners/dace/transformations/strides.py | 35 ++++++++++++------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 1d04c21fc3..ac2fadab09 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -762,12 +762,12 @@ def _gt_auto_configure_maps_and_strides( For a description of the arguments see the `gt_auto_optimize()` function. """ - # We now set the iteration order of the Maps. For that we use `unit_strides_kind` - # argument and if not supplied we guess depending if we are on the GPU or not. + # If no unit stride is given explicitly we assume that it is in the horizontal. + # NOTE: Before the optimizer assumed that the memory layout was different for + # GPU (horizontal first) and CPU (vertical first). However this was wrong. if unit_strides_kind is None: - unit_strides_kind = ( - gtx_common.DimensionKind.HORIZONTAL if gpu else gtx_common.DimensionKind.VERTICAL - ) + unit_strides_kind = gtx_common.DimensionKind.HORIZONTAL + # It is not possible to use the `unit_strides_dim` argument of the # function, because `LoopBlocking`, if run, changed the name of the # parameter but the dimension can still be identified by its "kind". @@ -782,11 +782,10 @@ def _gt_auto_configure_maps_and_strides( # get expanded, i.e. turned into Maps because no `cudaMemcpy*()` call exists, # which requires that the final strides are there. Furthermore, Memlet expansion # has to happen before the GPU block size is set. 
There are several possible - # solutions for that, of which none is really good. The one that is the least - # bad thing is to set the strides of the transients here. The main downside - # is that this and the `_gt_auto_post_processing()` function has these weird - # names. - gtx_transformations.gt_change_strides(sdfg, gpu=gpu) + # solutions for that, of which none is really good. The least bad one is to + # set the strides of the transients here. The main downside is that this and + # the `_gt_auto_post_processing()` function has these weird names. + gtx_transformations.gt_change_strides(sdfg) if gpu: # TODO(phimuell): The GPU function might modify the map iteration order. diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py index 928fa04d54..84fc2ee14c 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py @@ -33,37 +33,45 @@ def gt_change_strides( sdfg: dace.SDFG, - gpu: bool, ) -> dace.SDFG: """Modifies the strides of transients. The function will analyse the access patterns and set the strides of - transients in the optimal way. - The function should run after all maps have been created. + transients in the optimal way. The function should run after _all_ + Maps have been created. + After the adjustment of the strides they will be propagated into the nested + SDFGs, see `gt_propagate_strides_of()` for more. - After the strides have been adjusted the function will also propagate - the strides into nested SDFG, see `gt_propagate_strides_of()` for more. Args: sdfg: The SDFG to process. - gpu: If the SDFG is supposed to run on the GPU. Note: Currently the function will not scan the access pattern. Instead it will - either use FORTRAN order for GPU or C order. 
This function needs to be called + translate the memory layout such that the horizontal dimension has stride 1, + which is used by the GT4Py allocator. This function needs to be called for both CPU and GPU to handle strides of memlets inside nested SDFGs. Todo: - - Implement the estimation correctly. + - Update this function such that the memory order is computed based on the + access pattern. Probably also merge it with `gt_set_iteration_order()` + function as the task are related. + - Im """ - # TODO(phimeull): Implement this function correctly. + # TODO(phimeull): Implement this function correctly, such that it decides the + # order based on the access pattern. Probably also merge it with + # `gt_set_iteration_order()` as the two things are related. + # NOTE: This function builds on the fact that in GT4Py the horizontal dimension + # is always the first dimensions, i.e. column or FORTRAN order and that in + # DaCe the default order (which the lowering uses), is row or C order. + # Thus we just have to inverse the order for all transients and propagate + # the new strides. for nsdfg in sdfg.all_sdfgs_recursive(): - _gt_change_strides_non_recursive_impl(nsdfg, gpu) + _gt_change_strides_non_recursive_impl(nsdfg) def _gt_change_strides_non_recursive_impl( sdfg: dace.SDFG, - gpu: bool, ) -> None: """Set optimal strides of all access nodes in the SDFG. @@ -103,7 +111,7 @@ def _gt_change_strides_non_recursive_impl( # access nodes because the non-transients come from outside and have their # own strides. # TODO(phimuell): Set the stride based on the actual access pattern. - if desc.transient and gpu: + if desc.transient: new_stride_order = list(range(ndim)) desc.set_strides_from_layout(*new_stride_order) @@ -124,7 +132,8 @@ def _gt_change_strides_non_recursive_impl( ) # Now handle the views. - # TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can handle views. 
+ # TODO(phimuell): Remove once `gt_propagate_strides_from_access_node()` can + # handle views. However, we should get to a point where we do not have views. _gt_modify_strides_of_views_non_recursive(sdfg) From 2027ad6dbe498c63bf7bdd6d3ff71e272c5ddc42 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Wed, 28 Jan 2026 09:01:06 +0100 Subject: [PATCH 2/8] Made some additional notes. --- .../runners/dace/transformations/strides.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py index 84fc2ee14c..c99e9b79a2 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py @@ -50,16 +50,15 @@ def gt_change_strides( translate the memory layout such that the horizontal dimension has stride 1, which is used by the GT4Py allocator. This function needs to be called for both CPU and GPU to handle strides of memlets inside nested SDFGs. - - Todo: - - Update this function such that the memory order is computed based on the - access pattern. Probably also merge it with `gt_set_iteration_order()` - function as the task are related. - - Im + Furthermore, the current implementation assumes that there is only one + horizontal dimension. """ # TODO(phimeull): Implement this function correctly, such that it decides the # order based on the access pattern. Probably also merge it with # `gt_set_iteration_order()` as the two things are related. + # TODO(phimuell): The current implementation assumes that there is only one + # horizontal dimension. If there are multiple horizontal ones then we might + # have a problem. # NOTE: This function builds on the fact that in GT4Py the horizontal dimension # is always the first dimensions, i.e. 
column or FORTRAN order and that in # DaCe the default order (which the lowering uses), is row or C order. From 6179a6adb9d2762a1a508c34778db48a15e8ec11 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Wed, 28 Jan 2026 15:45:25 +0100 Subject: [PATCH 3/8] Updated the description and naming a bit. --- .../runners/dace/transformations/auto_optimize.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index ac2fadab09..82750703b8 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -763,17 +763,19 @@ def _gt_auto_configure_maps_and_strides( """ # If no unit stride is given explicitly we assume that it is in the horizontal. - # NOTE: Before the optimizer assumed that the memory layout was different for - # GPU (horizontal first) and CPU (vertical first). However this was wrong. - if unit_strides_kind is None: - unit_strides_kind = gtx_common.DimensionKind.HORIZONTAL + # This has also technical reasons to avoid launch errors (on GPU we have to make + # sure that the biggest dimension ends up on the `x` direction, which is most + # likely the horizontal dimension). + prime_direction_kind = ( + gtx_common.DimensionKind.HORIZONTAL if unit_strides_kind is None else unit_strides_kind + ) # It is not possible to use the `unit_strides_dim` argument of the # function, because `LoopBlocking`, if run, changed the name of the # parameter but the dimension can still be identified by its "kind". 
gtx_transformations.gt_set_iteration_order( - sdfg=sdfg, - unit_strides_kind=unit_strides_kind, + unit_strides_kind=prime_direction_kind, validate=False, validate_all=validate_all, ) From 88ac3babc838bec85f8b36ea8613fa4301eff531 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 29 Jan 2026 07:56:12 +0100 Subject: [PATCH 4/8] Changed the selection of the leading kind and also clarified on the description. If the leading kind is not known then it will not reorder strides nor the iteration order. However, for certain reasons (launch errors) we have to set one for GPU in that case. --- .../dace/transformations/auto_optimize.py | 70 ++++++++++++------- .../runners/dace/transformations/gpu_utils.py | 1 + .../runners/dace/transformations/strides.py | 62 +++++++++------- 3 files changed, 84 insertions(+), 49 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 82750703b8..5116f72075 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -762,34 +762,56 @@ def _gt_auto_configure_maps_and_strides( For a description of the arguments see the `gt_auto_optimize()` function. """ - # If no unit stride is given explicitly we assume that it is in the horizontal. - # This has also technical reasons to avoid launch errors (on GPU we have to make - # sure that the biggest dimension ends up on the `x` direction, which is most - # likely the horizontal dimension). - prime_direction_kind = ( - gtx_common.DimensionKind.HORIZONTAL if unit_strides_kind is None else unit_strides_kind - ) - - # It is not possible to use the `unit_strides_dim` argument of the - # function, because `LoopBlocking`, if run, changed the name of the - # parameter but the dimension can still be identified by its "kind". 
- gtx_transformations.gt_set_iteration_order( - sdfg=sdfg, - unit_strides_kind=prime_direction_kind, - validate=False, - validate_all=validate_all, - ) + # If `unit_strides_kind` is unknown we will not modify the Map order nor the + # strides, except if we are on GPU. The reason for this is that the maximal + # number of blocks is different for each dimension. If the largest dimension + # is for example associated with the `z` dimension, we would get launch errors + # at some point. Thus in that case we pretend that it is horizontal. Which is + # a valid assumption for any ICON-like code or if the GT4Py allocator is used. + # TODO(phimuell): Make this selection more intelligent. + if unit_strides_kind is None and gpu: + prefered_direction_kind: Optional[gtx_common.DimensionKind] = ( + gtx_common.DimensionKind.HORIZONTAL + ) + else: + prefered_direction_kind = unit_strides_kind + + # We should actually use a `gtx.Dimension` here and not a `gtx.DimensionKind`, + # since they are unique. However at this stage, especially after the expansion + # of non standard Memlets (which happens in the GPU transformation) associating + # Map parameters with GT4Py dimension is very hard to impossible. At this stage + # the kind is the most reliable indicator we have. + # NOTE: This is not the only location where we manipulate the Map order, we also + # do it in the GPU transformation, where we have to set the order of the + # expanded Memlets. + if prefered_direction_kind is not None: + gtx_transformations.gt_set_iteration_order( + sdfg=sdfg, + unit_strides_kind=prefered_direction_kind, + validate=False, + validate_all=validate_all, + ) # NOTE: We have to set the strides of transients before the non-standard Memlets - # get expanded, i.e. turned into Maps because no `cudaMemcpy*()` call exists, - # which requires that the final strides are there. Furthermore, Memlet expansion - # has to happen before the GPU block size is set. 
There are several possible - # solutions for that, of which none is really good. The least bad one is to - # set the strides of the transients here. The main downside is that this and - # the `_gt_auto_post_processing()` function has these weird names. - gtx_transformations.gt_change_strides(sdfg) + # get expanded, i.e. turned into Maps because no matching `cudaMemcpy*()` call + # exists, which requires that the final strides are there. Furthermore, Memlet + # expansion has to happen before the GPU block size is set. There are several + # possible solutions for that, of which none is really good. The least bad one + # is to set the strides of the transients here. The main downside is that we + # slightly modify the SDFG in the GPU transformation after we have set the + # strides. + if prefered_direction_kind is not None: + gtx_transformations.gt_change_strides(sdfg, prefered_direction_kind=prefered_direction_kind) if gpu: + if unit_strides_kind != gtx_common.DimensionKind.HORIZONTAL: + warnings.warn( + "The GT4Py DaCe backend assumes that in GPU mode the leading dimension" + f" is horizontal, but it was '{unit_strides_kind}', this might lead" + " to suboptimal performance", + stacklevel=2, + ) + # TODO(phimuell): The GPU function might modify the map iteration order. # This is because how it is implemented (promotion and fusion). 
However, # because of its current state, this should not happen, but we have to look diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py b/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py index 1786913edb..ac81d2cd64 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py @@ -241,6 +241,7 @@ def restrict_fusion_to_newly_created_maps_horizontal( if len(maps_to_modify) == 0: return sdfg + # NOTE: This inherently assumes a particular memory order, see `gt_change_strides()`. for me_to_modify in maps_to_modify: map_to_modify: dace_nodes.Map = me_to_modify.map map_to_modify.params = list(reversed(map_to_modify.params)) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py index c99e9b79a2..0840f77755 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/strides.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/strides.py @@ -12,6 +12,7 @@ from dace import data as dace_data from dace.sdfg import nodes as dace_nodes +from gt4py.next import common as gtx_common from gt4py.next.program_processors.runners.dace import ( sdfg_args as gtx_dace_args, transformations as gtx_transformations, @@ -33,6 +34,7 @@ def gt_change_strides( sdfg: dace.SDFG, + prefered_direction_kind: gtx_common.DimensionKind, ) -> dace.SDFG: """Modifies the strides of transients. @@ -44,35 +46,50 @@ def gt_change_strides( Args: sdfg: The SDFG to process. + prefered_direction_kind: `DimensionKind` of the dimension with stride 1. Note: - Currently the function will not scan the access pattern. Instead it will - translate the memory layout such that the horizontal dimension has stride 1, - which is used by the GT4Py allocator. 
This function needs to be called - for both CPU and GPU to handle strides of memlets inside nested SDFGs. - Furthermore, the current implementation assumes that there is only one - horizontal dimension. + - This function should be run after `gt_set_iteration_order()` has been run. + - Currently the function will not scan the access pattern. Instead it relies + on the default behaviour of the lowering and how the GT4Py allocator works. + - The current implementation assumes that there is only one dimension of the + given kind. """ # TODO(phimeull): Implement this function correctly, such that it decides the # order based on the access pattern. Probably also merge it with # `gt_set_iteration_order()` as the two things are related. - # TODO(phimuell): The current implementation assumes that there is only one - # horizontal dimension. If there are multiple horizontal ones then we might - # have a problem. - # NOTE: This function builds on the fact that in GT4Py the horizontal dimension - # is always the first dimensions, i.e. column or FORTRAN order and that in - # DaCe the default order (which the lowering uses), is row or C order. - # Thus we just have to inverse the order for all transients and propagate - # the new strides. - for nsdfg in sdfg.all_sdfgs_recursive(): - _gt_change_strides_non_recursive_impl(nsdfg) + # NOTE: This function inherently assumes the dimension order defined by + # `gtx_common.order_dimensions()`, the default behaviour of the lowering, + # partially how the GT4Py allocator works and that there is only one dimension + # of any kind (which is true for ICON4Py, but not in general, for example + # in Cartesian grids). Its base assumption is that the ordering (Map parameters + # and strides) generated by the lowering "out of the box" are in row major/C + # order. Because of the GT4Py dimension order this is the right order for + # `gtx_common.DimensionKind.VERTICAL`. 
If the primary direction kind is + # `HORIZONTAL`, then according to the GT4Py dimension order column major/FORTRAN + # order should be used. To get there we have to reverse the strides order, which + # `_gt_change_strides_non_recursive_impl()` does. This is very brittle but at + # this point the best thing we can do. + + match prefered_direction_kind: + case gtx_common.DimensionKind.VERTICAL: + return # Nothing to do in that case. Maybe run Memlet propagation here? + + case gtx_common.DimensionKind.HORIZONTAL: + for nsdfg in sdfg.all_sdfgs_recursive(): + _gt_change_strides_non_recursive_impl(nsdfg) + + case _: + raise ValueError( + f"Encountered unknown `DimensionKind` value: {prefered_direction_kind}" + ) def _gt_change_strides_non_recursive_impl( sdfg: dace.SDFG, ) -> None: - """Set optimal strides of all access nodes in the SDFG. + """Set "optimal" strides of all access nodes in the SDFG. The function will look for all top level access node, see `_gt_find_toplevel_data_accesses()` and set their strides such that the access is optimal, see Note. The function @@ -81,14 +98,9 @@ def _gt_change_strides_non_recursive_impl( This function should never be called directly but always through `gt_change_strides()`! Note: - Currently the function just reverses the strides of the data descriptor - of transient access nodes it processes. Since DaCe generates `C` order by default - this lead to FORTRAN order, which is (for now) sufficient to optimize the memory - layout to GPU. - - Todo: - Make this function more intelligent to analyse the access pattern and then - figuring out the best order. + This function has the same underlying assumption as they are outlined in + `gt_change_strides()`, see there for more information about the underlying + assumptions and limitations. """ # NOTE: We have to process all access nodes (transient and globals). If we are inside a # NestedSDFG then they were handled before on the level above us. 
From f390cff3e52659114469f5952697ce22c4177647 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 29 Jan 2026 10:09:54 +0100 Subject: [PATCH 5/8] Added a compatibility layer for ICON4Py. --- src/gt4py/next/metrics.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 src/gt4py/next/metrics.py diff --git a/src/gt4py/next/metrics.py b/src/gt4py/next/metrics.py new file mode 100644 index 0000000000..735465a81a --- /dev/null +++ b/src/gt4py/next/metrics.py @@ -0,0 +1,12 @@ +# GT4Py - GridTools Framework +# +# Copyright (c) 2014-2024, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +# Needed for compatibility with ICON4Py +from gt4py.next.instrumentation.metrics import * # noqa: F403 [undefined-local-with-import-star] From 080c669d20e929cd9884b5171dc5d877f4e9f6af Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 29 Jan 2026 10:52:04 +0100 Subject: [PATCH 6/8] Updated the warning. --- .../runners/dace/transformations/auto_optimize.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 5116f72075..a32c446799 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -806,9 +806,10 @@ def _gt_auto_configure_maps_and_strides( if gpu: if unit_strides_kind != gtx_common.DimensionKind.HORIZONTAL: warnings.warn( - "The GT4Py DaCe backend assumes that in GPU mode the leading dimension" - f" is horizontal, but it was '{unit_strides_kind}', this might lead" - " to suboptimal performance", + "The GT4Py DaCe GPU backend assumes that the leading dimension, i.e." 
+ " where stride is 1, is of kind 'HORIZONTAL', however it was" + f" '{unit_strides_kind}' and is the last index. Other configurations" + " might lead to suboptimal performance.", stacklevel=2, ) From ad3a0489519f37cf3efe29aa017963e99b1a6479 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 29 Jan 2026 14:19:16 +0100 Subject: [PATCH 7/8] Removed the compatibility hack. --- src/gt4py/next/metrics.py | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 src/gt4py/next/metrics.py diff --git a/src/gt4py/next/metrics.py b/src/gt4py/next/metrics.py deleted file mode 100644 index 735465a81a..0000000000 --- a/src/gt4py/next/metrics.py +++ /dev/null @@ -1,12 +0,0 @@ -# GT4Py - GridTools Framework -# -# Copyright (c) 2014-2024, ETH Zurich -# All rights reserved. -# -# Please, refer to the LICENSE file in the root directory. -# SPDX-License-Identifier: BSD-3-Clause - -from __future__ import annotations - -# Needed for compatibility with ICON4Py -from gt4py.next.instrumentation.metrics import * # noqa: F403 [undefined-local-with-import-star] From 458f99801ac883f332166e8265c10a0ddec939f7 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Fri, 30 Jan 2026 08:48:12 +0100 Subject: [PATCH 8/8] Correction. --- .../runners/dace/transformations/auto_optimize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index a32c446799..80e43711d3 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -808,8 +808,8 @@ def _gt_auto_configure_maps_and_strides( warnings.warn( "The GT4Py DaCe GPU backend assumes that the leading dimension, i.e." " where stride is 1, is of kind 'HORIZONTAL', however it was" - f" '{unit_strides_kind}' and is the last index. 
Other configurations" - " might lead to suboptimal performance.", + f" '{unit_strides_kind}'. Furthermore, it should be the last dimension." + " Other configurations might lead to suboptimal performance.", stacklevel=2, )