From 9cade71d42a129334bdb31de88facb0c2fc34e15 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 08:24:25 +0100 Subject: [PATCH 1/8] Let's try this order. NOT WORKING: 5.7612245082855225 --- .../dace/transformations/auto_optimize.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 1d04c21fc3..92361f36b8 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -15,6 +15,7 @@ import dace from dace import data as dace_data from dace.sdfg import nodes as dace_nodes, propagation as dace_propagation, utils as dace_sdutils +from dace.transformation import dataflow as dace_dataflow from dace.transformation.auto import auto_optimize as dace_aoptimize from dace.transformation.passes import analysis as dace_analysis @@ -674,13 +675,7 @@ def _gt_auto_process_dataflow_inside_maps( time, so the compiler will fully unroll them anyway. """ - # Constants (tasklets are needed to write them into a variable) should not be - # arguments to a kernel but be present inside the body. - sdfg.apply_transformations_once_everywhere( - gtx_transformations.GT4PyMoveTaskletIntoMap, - validate=False, - validate_all=validate_all, - ) + # TODO(phimuell): Find out if needed. gtx_transformations.gt_simplify( sdfg, skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, @@ -737,6 +732,20 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) + + # Constants (tasklets are needed to write them into a variable) should not be + # arguments to a kernel but be present inside the body. 
+ sdfg.apply_transformations_once_everywhere( + gtx_transformations.GT4PyMoveTaskletIntoMap, + validate=False, + validate_all=validate_all, + ) + return sdfg From cba099634b6cec6c9389b17780ff50b07b940211 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 08:42:36 +0100 Subject: [PATCH 2/8] Maybe this is better. DOES NOT WORK: 5.77982020 --- .../runners/dace/transformations/auto_optimize.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 92361f36b8..1fbbf82af8 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -696,6 +696,12 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) + # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called @@ -732,12 +738,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, - validate=False, - validate_all=validate_all, - ) - # Constants (tasklets are needed to write them into a variable) should not be # arguments to a kernel but be present inside the body. sdfg.apply_transformations_once_everywhere( From 8a389ee590f59c4c2bca5c9414eb232adc1b0fa1 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 08:58:36 +0100 Subject: [PATCH 3/8] Maybe the simplify call was unneeded. 
NOT WORKING: 5.89192s --- .../runners/dace/transformations/auto_optimize.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 1fbbf82af8..27da81cfac 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -675,14 +675,6 @@ def _gt_auto_process_dataflow_inside_maps( time, so the compiler will fully unroll them anyway. """ - # TODO(phimuell): Find out if needed. - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, - validate=False, - validate_all=validate_all, - ) - # Blocking is performed first, because this ensures that as much as possible # is moved into the k independent part. if blocking_dim is not None: From 244dc10c2c987e77d52141fec9452fbfe8806101 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 09:26:52 +0100 Subject: [PATCH 4/8] This is closer to the empirical version, let's try it. SEEMS WORKING: 4.57165s --- .../dace/transformations/auto_optimize.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 27da81cfac..5ccaa2c182 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -694,6 +694,16 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Constants (tasklets are needed to write them into a variable) should not be + # arguments to a kernel but be present inside the body. 
+ sdfg.apply_transformations_once_everywhere( + gtx_transformations.GT4PyMoveTaskletIntoMap, + validate=False, + validate_all=validate_all, + ) + + # TODO(phimuell): Do we need a simplify here. + # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called @@ -730,14 +740,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # Constants (tasklets are needed to write them into a variable) should not be - # arguments to a kernel but be present inside the body. - sdfg.apply_transformations_once_everywhere( - gtx_transformations.GT4PyMoveTaskletIntoMap, - validate=False, - validate_all=validate_all, - ) - return sdfg From ac2c5ce1175e4706a01e8a3aaa86d882c1dbdfdd Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 09:55:14 +0100 Subject: [PATCH 5/8] This is a bit nicer than the previous version, i.e. it has an explanation. But it also has an additional simplify that was present when TF was run in stage 1, but not in the other version. PERFORMANCE: 4.5106589s --- .../dace/transformations/auto_optimize.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 5ccaa2c182..43cb769d39 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -688,6 +688,13 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Empirical observation in MuPhys have shown that running `TaskletFusion` increases + # performance quite drastically. Thus it was added here. However, to ensure + # that `LoopBlocking` still works, i.e. 
independent and dependent Tasklets are + # not mixed it must run _after_ `LoopBlocking`. Furthermore, it has been shown + # that it has to run _before_ `GT4PyMoveTaskletIntoMap`. The reasons are not + # clear but it can be measured. + # TODO(phimuell): Restrict it to Tasklets only inside Maps. sdfg.apply_transformations_repeated( dace_dataflow.TaskletFusion, validate=False, @@ -701,8 +708,13 @@ def _gt_auto_process_dataflow_inside_maps( validate=False, validate_all=validate_all, ) - - # TODO(phimuell): Do we need a simplify here. + # TODO(phimuell): figuring out if this is needed? + gtx_transformations.gt_simplify( + sdfg, + skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, + validate=False, + validate_all=validate_all, + ) # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. From d7077175397578fe50e37adf1da59a532e8a010f Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Mon, 26 Jan 2026 07:56:20 +0100 Subject: [PATCH 6/8] Updated the description. --- .../dace/transformations/auto_optimize.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 43cb769d39..c59c3532b3 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -675,8 +675,9 @@ def _gt_auto_process_dataflow_inside_maps( time, so the compiler will fully unroll them anyway. """ - # Blocking is performed first, because this ensures that as much as possible - # is moved into the k independent part. + # Separate Tasklets into dependent and independent parts to promote data + # reusability. 
It is important that this step has to be performed before + # `TaskletFusion` is used. if blocking_dim is not None: sdfg.apply_transformations_once_everywhere( gtx_transformations.LoopBlocking( @@ -688,13 +689,14 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # Empirical observation in MuPhys have shown that running `TaskletFusion` increases - # performance quite drastically. Thus it was added here. However, to ensure - # that `LoopBlocking` still works, i.e. independent and dependent Tasklets are - # not mixed it must run _after_ `LoopBlocking`. Furthermore, it has been shown - # that it has to run _before_ `GT4PyMoveTaskletIntoMap`. The reasons are not - # clear but it can be measured. + # Merge Tasklets into bigger ones. + # NOTE: Empirical observation for Graupel have shown that this leads to an increase + # in performance, however, it has to be run before `GT4PyMoveTaskletIntoMap` + # (not fully clear why though, probably a compiler artefact) and as well as + # `MoveDataflowIntoIfBody` (not fully clear either, it `TaskletFusion` makes + # things simpler or prevent it from doing certain, negative, things). # TODO(phimuell): Restrict it to Tasklets only inside Maps. + # TODO(phimuell): Investigate more. sdfg.apply_transformations_repeated( dace_dataflow.TaskletFusion, validate=False, @@ -708,6 +710,7 @@ def _gt_auto_process_dataflow_inside_maps( validate=False, validate_all=validate_all, ) + # TODO(phimuell): figuring out if this is needed? gtx_transformations.gt_simplify( sdfg, @@ -729,6 +732,8 @@ def _gt_auto_process_dataflow_inside_maps( validate=False, validate_all=validate_all, ) + + # TODO(phimuell): figuring out if this is needed? 
gtx_transformations.gt_simplify( sdfg, skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, From 9d092c44cfe2e3881f50406dc772b1060c7747e9 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 29 Jan 2026 13:31:39 +0100 Subject: [PATCH 7/8] Added an option to enable TaskletFusion. By default it is off. --- .../dace/transformations/auto_optimize.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index c59c3532b3..a1dae67352 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -131,6 +131,7 @@ def gt_auto_optimize( assume_pointwise: bool = True, optimization_hooks: Optional[dict[GT4PyAutoOptHook, GT4PyAutoOptHookFun]] = None, demote_fields: Optional[list[str]] = None, + compact_tasklets: bool = False, validate: bool = True, validate_all: bool = False, **kwargs: Any, @@ -198,6 +199,7 @@ def gt_auto_optimize( see `GT4PyAutoOptHook` for more information. demote_fields: Consider these fields as transients for the purpose of optimization. Use at your own risk. See Notes for all implications. + compact_tasklets: Reduces the number of Tasklets by fusing them. validate: Perform validation during the steps. validate_all: Perform extensive validation. 
@@ -325,6 +327,7 @@ def gt_auto_optimize( blocking_only_if_independent_nodes=blocking_only_if_independent_nodes, scan_loop_unrolling=scan_loop_unrolling, scan_loop_unrolling_factor=scan_loop_unrolling_factor, + compact_tasklets=compact_tasklets, validate_all=validate_all, ) @@ -661,6 +664,7 @@ def _gt_auto_process_dataflow_inside_maps( blocking_only_if_independent_nodes: Optional[bool], scan_loop_unrolling: bool, scan_loop_unrolling_factor: int, + compact_tasklets: bool, validate_all: bool, ) -> dace.SDFG: """Optimizes the dataflow inside the top level Maps of the SDFG inplace. @@ -695,13 +699,14 @@ def _gt_auto_process_dataflow_inside_maps( # (not fully clear why though, probably a compiler artefact) and as well as # `MoveDataflowIntoIfBody` (not fully clear either, it `TaskletFusion` makes # things simpler or prevent it from doing certain, negative, things). - # TODO(phimuell): Restrict it to Tasklets only inside Maps. # TODO(phimuell): Investigate more. - sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, - validate=False, - validate_all=validate_all, - ) + # TODO(phimuell): Restrict it to Tasklets only inside Maps. + if compact_tasklets: + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) # Constants (tasklets are needed to write them into a variable) should not be # arguments to a kernel but be present inside the body. From 7a8f22b9e2a9a4891959ab0773d127e8dfcd9e4d Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 29 Jan 2026 14:26:09 +0100 Subject: [PATCH 8/8] Made the suggested renaming. 
--- .../runners/dace/transformations/auto_optimize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index a1dae67352..91aba1ba4c 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -131,7 +131,7 @@ def gt_auto_optimize( assume_pointwise: bool = True, optimization_hooks: Optional[dict[GT4PyAutoOptHook, GT4PyAutoOptHookFun]] = None, demote_fields: Optional[list[str]] = None, - compact_tasklets: bool = False, + fuse_tasklets: bool = False, validate: bool = True, validate_all: bool = False, **kwargs: Any, @@ -199,7 +199,7 @@ def gt_auto_optimize( see `GT4PyAutoOptHook` for more information. demote_fields: Consider these fields as transients for the purpose of optimization. Use at your own risk. See Notes for all implications. - compact_tasklets: Reduces the number of Tasklets by fusing them. + fuse_tasklets: Reduces the number of Tasklets by fusing them. validate: Perform validation during the steps. validate_all: Perform extensive validation. @@ -327,7 +327,7 @@ def gt_auto_optimize( blocking_only_if_independent_nodes=blocking_only_if_independent_nodes, scan_loop_unrolling=scan_loop_unrolling, scan_loop_unrolling_factor=scan_loop_unrolling_factor, - compact_tasklets=compact_tasklets, + fuse_tasklets=fuse_tasklets, validate_all=validate_all, ) @@ -664,7 +664,7 @@ def _gt_auto_process_dataflow_inside_maps( blocking_only_if_independent_nodes: Optional[bool], scan_loop_unrolling: bool, scan_loop_unrolling_factor: int, - compact_tasklets: bool, + fuse_tasklets: bool, validate_all: bool, ) -> dace.SDFG: """Optimizes the dataflow inside the top level Maps of the SDFG inplace. 
@@ -701,7 +701,7 @@ def _gt_auto_process_dataflow_inside_maps( # things simpler or prevent it from doing certain, negative, things). # TODO(phimuell): Investigate more. # TODO(phimuell): Restrict it to Tasklets only inside Maps. - if compact_tasklets: + if fuse_tasklets: sdfg.apply_transformations_repeated( dace_dataflow.TaskletFusion, validate=False,