From 9cade71d42a129334bdb31de88facb0c2fc34e15 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 08:24:25 +0100 Subject: [PATCH 01/20] Let's try this order. NOT WORKING: 5.7612245082855225 --- .../dace/transformations/auto_optimize.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 1d04c21fc3..92361f36b8 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -15,6 +15,7 @@ import dace from dace import data as dace_data from dace.sdfg import nodes as dace_nodes, propagation as dace_propagation, utils as dace_sdutils +from dace.transformation import dataflow as dace_dataflow from dace.transformation.auto import auto_optimize as dace_aoptimize from dace.transformation.passes import analysis as dace_analysis @@ -674,13 +675,7 @@ def _gt_auto_process_dataflow_inside_maps( time, so the compiler will fully unroll them anyway. """ - # Constants (tasklets are needed to write them into a variable) should not be - # arguments to a kernel but be present inside the body. - sdfg.apply_transformations_once_everywhere( - gtx_transformations.GT4PyMoveTaskletIntoMap, - validate=False, - validate_all=validate_all, - ) + # TODO(phimuell): Find out if needed. gtx_transformations.gt_simplify( sdfg, skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, @@ -737,6 +732,20 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) + + # Constants (tasklets are needed to write them into a variable) should not be + # arguments to a kernel but be present inside the body. 
+ sdfg.apply_transformations_once_everywhere( + gtx_transformations.GT4PyMoveTaskletIntoMap, + validate=False, + validate_all=validate_all, + ) + return sdfg From cba099634b6cec6c9389b17780ff50b07b940211 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 08:42:36 +0100 Subject: [PATCH 02/20] Maybe this is better. DOES NOT WORK: 5.77982020 --- .../runners/dace/transformations/auto_optimize.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 92361f36b8..1fbbf82af8 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -696,6 +696,12 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) + # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called @@ -732,12 +738,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, - validate=False, - validate_all=validate_all, - ) - # Constants (tasklets are needed to write them into a variable) should not be # arguments to a kernel but be present inside the body. sdfg.apply_transformations_once_everywhere( From 8a389ee590f59c4c2bca5c9414eb232adc1b0fa1 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 08:58:36 +0100 Subject: [PATCH 03/20] Maybe the simplify call was unneeded. 
NOT WORKING: 5.89192s --- .../runners/dace/transformations/auto_optimize.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 1fbbf82af8..27da81cfac 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -675,14 +675,6 @@ def _gt_auto_process_dataflow_inside_maps( time, so the compiler will fully unroll them anyway. """ - # TODO(phimuell): Find out if needed. - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, - validate=False, - validate_all=validate_all, - ) - # Blocking is performed first, because this ensures that as much as possible # is moved into the k independent part. if blocking_dim is not None: From 244dc10c2c987e77d52141fec9452fbfe8806101 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 09:26:52 +0100 Subject: [PATCH 04/20] This is nearer at the empirical version, let's try it. SEEMS WORKING: 4.57165s --- .../dace/transformations/auto_optimize.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 27da81cfac..5ccaa2c182 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -694,6 +694,16 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Constants (tasklets are needed to write them into a variable) should not be + # arguments to a kernel but be present inside the body. 
+ sdfg.apply_transformations_once_everywhere( + gtx_transformations.GT4PyMoveTaskletIntoMap, + validate=False, + validate_all=validate_all, + ) + + # TODO(phimuell): Do we need a simplify here. + # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called @@ -730,14 +740,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # Constants (tasklets are needed to write them into a variable) should not be - # arguments to a kernel but be present inside the body. - sdfg.apply_transformations_once_everywhere( - gtx_transformations.GT4PyMoveTaskletIntoMap, - validate=False, - validate_all=validate_all, - ) - return sdfg From ac2c5ce1175e4706a01e8a3aaa86d882c1dbdfdd Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 09:55:14 +0100 Subject: [PATCH 05/20] This is a bit nicer than the previous version, i.e. it has an explanation. But it also has an additional simplify that was present when TF was run in stage 1, but not in the other version. PERFORMANCE: 4.5106589s --- .../dace/transformations/auto_optimize.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 5ccaa2c182..43cb769d39 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -688,6 +688,13 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Empirical observation in MuPhys have shown that running `TaskletFusion` increases + # performance quite drastically. Thus it was added here. However, to ensure + # that `LoopBlocking` still works, i.e. 
independent and dependent Tasklets are + # not mixed it must run _after_ `LoopBlocking`. Furthermore, it has been shown + # that it has to run _before_ `GT4PyMoveTaskletIntoMap`. The reasons are not + # clear but it can be measured. + # TODO(phimuell): Restrict it to Tasklets only inside Maps. sdfg.apply_transformations_repeated( dace_dataflow.TaskletFusion, validate=False, @@ -701,8 +708,13 @@ def _gt_auto_process_dataflow_inside_maps( validate=False, validate_all=validate_all, ) - - # TODO(phimuell): Do we need a simplify here. + # TODO(phimuell): figuring out if this is needed? + gtx_transformations.gt_simplify( + sdfg, + skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, + validate=False, + validate_all=validate_all, + ) # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. From b68bae55bf3dae2314f8ebf956c0ef6ef3d78e70 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 10:12:37 +0100 Subject: [PATCH 06/20] An experiment, let's see what happens. DOES NOT WORK: 5.90685105s --- .../dace/transformations/auto_optimize.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 43cb769d39..05eb840aea 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -688,6 +688,26 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Move dataflow into the branches of the `if` such that they are only evaluated + # if they are needed. Important to call it repeatedly. + # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called + # before or after `LoopBlocking`. 
In cases where the condition is `False` + # most of the times calling it before is better, but if the condition is + # `True` then this order is better. Solve that issue. + sdfg.apply_transformations_repeated( + gtx_transformations.MoveDataflowIntoIfBody( + ignore_upstream_blocks=False, + ), + validate=False, + validate_all=validate_all, + ) + gtx_transformations.gt_simplify( + sdfg, + skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, + validate=False, + validate_all=validate_all, + ) + # Empirical observation in MuPhys have shown that running `TaskletFusion` increases # performance quite drastically. Thus it was added here. However, to ensure # that `LoopBlocking` still works, i.e. independent and dependent Tasklets are @@ -716,26 +736,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # Move dataflow into the branches of the `if` such that they are only evaluated - # if they are needed. Important to call it repeatedly. - # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called - # before or after `LoopBlocking`. In cases where the condition is `False` - # most of the times calling it before is better, but if the condition is - # `True` then this order is better. Solve that issue. - sdfg.apply_transformations_repeated( - gtx_transformations.MoveDataflowIntoIfBody( - ignore_upstream_blocks=False, - ), - validate=False, - validate_all=validate_all, - ) - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, - validate=False, - validate_all=validate_all, - ) - # After some transformation we see in the SDFG that there are pointwise views # generated after reduction nodes. These views are unecessary and might produce # sub optimal GPU code thus we remove them. 
From 8211e57e273304fb1178b832b9c09392f2723272 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 11:31:37 +0100 Subject: [PATCH 07/20] Revert "An experiment, let's see what happens." This reverts commit b68bae55bf3dae2314f8ebf956c0ef6ef3d78e70. --- .../dace/transformations/auto_optimize.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 05eb840aea..43cb769d39 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -688,26 +688,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # Move dataflow into the branches of the `if` such that they are only evaluated - # if they are needed. Important to call it repeatedly. - # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called - # before or after `LoopBlocking`. In cases where the condition is `False` - # most of the times calling it before is better, but if the condition is - # `True` then this order is better. Solve that issue. - sdfg.apply_transformations_repeated( - gtx_transformations.MoveDataflowIntoIfBody( - ignore_upstream_blocks=False, - ), - validate=False, - validate_all=validate_all, - ) - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, - validate=False, - validate_all=validate_all, - ) - # Empirical observation in MuPhys have shown that running `TaskletFusion` increases # performance quite drastically. Thus it was added here. However, to ensure # that `LoopBlocking` still works, i.e. 
independent and dependent Tasklets are @@ -736,6 +716,26 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Move dataflow into the branches of the `if` such that they are only evaluated + # if they are needed. Important to call it repeatedly. + # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called + # before or after `LoopBlocking`. In cases where the condition is `False` + # most of the times calling it before is better, but if the condition is + # `True` then this order is better. Solve that issue. + sdfg.apply_transformations_repeated( + gtx_transformations.MoveDataflowIntoIfBody( + ignore_upstream_blocks=False, + ), + validate=False, + validate_all=validate_all, + ) + gtx_transformations.gt_simplify( + sdfg, + skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, + validate=False, + validate_all=validate_all, + ) + # After some transformation we see in the SDFG that there are pointwise views # generated after reduction nodes. These views are unecessary and might produce # sub optimal GPU code thus we remove them. From aa6afa2928abe7c422f9a4dc8dea9ba6cc6f3a4b Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 11:39:01 +0100 Subject: [PATCH 08/20] New experiment with a different inliner. The inliner ignores Tasklets with empty Memlets and the PR also moved the `TaskletIntoMap` before the TF. This is like the version that did not worked, i.e. was super fast, but with the new TF we should recover it. 
WORKING: 4.604942321777344s --- .../dace/transformations/auto_optimize.py | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 43cb769d39..f90c8ee42c 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -654,6 +654,29 @@ def _gt_auto_process_top_level_maps( return sdfg +class TaskletFusion2(dace_dataflow.TaskletFusion): + """ + Implementation for experiment, that ignores any Tasklet that has an empty Memlet. + Just for experimental reasons. + """ + + def can_be_applied( + self, + graph: dace.SDFGState, + expr_index: int, + sdfg: dace.SDFG, + permissive: bool = False, + ) -> bool: + if any( + e.data.is_empty() + for e in list(graph.in_edges(self.t1)) + list(graph.out_edges(self.t1)) + ): + return False + return super().can_be_applied( + graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive + ) + + def _gt_auto_process_dataflow_inside_maps( sdfg: dace.SDFG, blocking_dim: Optional[gtx_common.Dimension], @@ -688,6 +711,14 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Constants (tasklets are needed to write them into a variable) should not be + # arguments to a kernel but be present inside the body. + sdfg.apply_transformations_once_everywhere( + gtx_transformations.GT4PyMoveTaskletIntoMap, + validate=False, + validate_all=validate_all, + ) + # Empirical observation in MuPhys have shown that running `TaskletFusion` increases # performance quite drastically. Thus it was added here. However, to ensure # that `LoopBlocking` still works, i.e. independent and dependent Tasklets are @@ -696,18 +727,11 @@ def _gt_auto_process_dataflow_inside_maps( # clear but it can be measured. 
# TODO(phimuell): Restrict it to Tasklets only inside Maps. sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, + TaskletFusion2, validate=False, validate_all=validate_all, ) - # Constants (tasklets are needed to write them into a variable) should not be - # arguments to a kernel but be present inside the body. - sdfg.apply_transformations_once_everywhere( - gtx_transformations.GT4PyMoveTaskletIntoMap, - validate=False, - validate_all=validate_all, - ) # TODO(phimuell): figuring out if this is needed? gtx_transformations.gt_simplify( sdfg, From 03581a572dd9b08cf4c165d456f049a53bfe07f5 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 13:23:18 +0100 Subject: [PATCH 09/20] Revert "New experiment with a different inliner." This reverts commit aa6afa2928abe7c422f9a4dc8dea9ba6cc6f3a4b. --- .../dace/transformations/auto_optimize.py | 40 ++++--------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index f90c8ee42c..43cb769d39 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -654,29 +654,6 @@ def _gt_auto_process_top_level_maps( return sdfg -class TaskletFusion2(dace_dataflow.TaskletFusion): - """ - Implementation for experiment, that ignores any Tasklet that has an empty Memlet. - Just for experimental reasons. 
- """ - - def can_be_applied( - self, - graph: dace.SDFGState, - expr_index: int, - sdfg: dace.SDFG, - permissive: bool = False, - ) -> bool: - if any( - e.data.is_empty() - for e in list(graph.in_edges(self.t1)) + list(graph.out_edges(self.t1)) - ): - return False - return super().can_be_applied( - graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive - ) - - def _gt_auto_process_dataflow_inside_maps( sdfg: dace.SDFG, blocking_dim: Optional[gtx_common.Dimension], @@ -711,14 +688,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # Constants (tasklets are needed to write them into a variable) should not be - # arguments to a kernel but be present inside the body. - sdfg.apply_transformations_once_everywhere( - gtx_transformations.GT4PyMoveTaskletIntoMap, - validate=False, - validate_all=validate_all, - ) - # Empirical observation in MuPhys have shown that running `TaskletFusion` increases # performance quite drastically. Thus it was added here. However, to ensure # that `LoopBlocking` still works, i.e. independent and dependent Tasklets are @@ -727,11 +696,18 @@ def _gt_auto_process_dataflow_inside_maps( # clear but it can be measured. # TODO(phimuell): Restrict it to Tasklets only inside Maps. sdfg.apply_transformations_repeated( - TaskletFusion2, + dace_dataflow.TaskletFusion, validate=False, validate_all=validate_all, ) + # Constants (tasklets are needed to write them into a variable) should not be + # arguments to a kernel but be present inside the body. + sdfg.apply_transformations_once_everywhere( + gtx_transformations.GT4PyMoveTaskletIntoMap, + validate=False, + validate_all=validate_all, + ) # TODO(phimuell): figuring out if this is needed? 
gtx_transformations.gt_simplify( sdfg, From c40417421324dc274d3ce5fa01db21fa3b1b754e Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 13:26:17 +0100 Subject: [PATCH 10/20] Now let's check what happens if we at the end of the fast version run Tasklet fusion again. I have noticed that in the normal/fast version the Tasklets with the literals are not inlined into other Tasklets. Now let's do that. DOES NOT WORK: 6.070111036300659s --- .../runners/dace/transformations/auto_optimize.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 43cb769d39..fdc03c0e5f 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -752,6 +752,13 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Let's continue to see what happens if we now inline again. + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) + return sdfg From cef7ebb87a1d922b747bc8d40310c73118737fbf Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 14:05:34 +0100 Subject: [PATCH 11/20] Revert "Now let's check what happens if we at the end of the fast version run Tasklet fusion again." This reverts commit c40417421324dc274d3ce5fa01db21fa3b1b754e. 
WORKS: 4.4253344535s --- .../runners/dace/transformations/auto_optimize.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index fdc03c0e5f..43cb769d39 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -752,13 +752,6 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # Let's continue to see what happens if we now inline again. - sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, - validate=False, - validate_all=validate_all, - ) - return sdfg From a17e6bf2f292fe707101fcc6be889440e9b5a7d3 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 14:20:02 +0100 Subject: [PATCH 12/20] Essentially the same experiment as in `c40417421324dc274d3ce5fa01db21fa3b1b754e`. But this time it is run right after the inlining. I think there will be a next iteration. DOES NOT WORK: 6.1715042 --- .../runners/dace/transformations/auto_optimize.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 43cb769d39..812dd229c4 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -729,6 +729,14 @@ def _gt_auto_process_dataflow_inside_maps( validate=False, validate_all=validate_all, ) + + # We also see a slowdown if this is at the end. Let's see if it also work here. 
+ sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) + gtx_transformations.gt_simplify( sdfg, skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, From dcc4b5efdb7c637fe542caf5fe47dada3b4a5f83 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 14:32:57 +0100 Subject: [PATCH 13/20] This version should be fast, but as soon as we know we will enable the currently disabled code and see if it is slow. As expected it is kind of fast. WORKS: 4.620257 --- .../dace/transformations/auto_optimize.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 812dd229c4..5522236a4a 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -708,13 +708,6 @@ def _gt_auto_process_dataflow_inside_maps( validate=False, validate_all=validate_all, ) - # TODO(phimuell): figuring out if this is needed? - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, - validate=False, - validate_all=validate_all, - ) # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. @@ -731,11 +724,11 @@ def _gt_auto_process_dataflow_inside_maps( ) # We also see a slowdown if this is at the end. Let's see if it also work here. 
- sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, - validate=False, - validate_all=validate_all, - ) + #sdfg.apply_transformations_repeated( + # dace_dataflow.TaskletFusion, + # validate=False, + # validate_all=validate_all, + #) gtx_transformations.gt_simplify( sdfg, From 1047b06ff0151ec15a426b694cd9f3f1cfa1d860 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 14:44:48 +0100 Subject: [PATCH 14/20] Now let's activate the second FT pass. I would have expected that it is slow now but it is not. WORKS: 4.66531s --- .../runners/dace/transformations/auto_optimize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 5522236a4a..340d9220c7 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -724,11 +724,11 @@ def _gt_auto_process_dataflow_inside_maps( ) # We also see a slowdown if this is at the end. Let's see if it also work here. - #sdfg.apply_transformations_repeated( - # dace_dataflow.TaskletFusion, - # validate=False, - # validate_all=validate_all, - #) + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) gtx_transformations.gt_simplify( sdfg, From 0bffbcce44aab89ab0b8e060c5e8994821f229c3 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 22 Jan 2026 15:09:45 +0100 Subject: [PATCH 15/20] New experiment with running it twice. This seems to do the trick. If between the `MoveDataflowIntoIf` and the second `TaskletFusion` is a call to simplify, it is slow. 
DOES NOT WORK: 6.2248911s --- .../dace/transformations/auto_optimize.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 340d9220c7..a91afde0f6 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -723,19 +723,21 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # We also see a slowdown if this is at the end. Let's see if it also work here. - sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, - validate=False, - validate_all=validate_all, - ) - + # We see slowdowns if at the end is a call to the TF transformation, see commit + # `c40417421324dc274d3ce5fa01db21fa3b1b754e`. But if we put it just after + # `MoveDataflowIntoIfBody` then nothing happens. We will now try to replacate + # the earlier experiment by putting a simplify call in between. gtx_transformations.gt_simplify( sdfg, skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, validate=False, validate_all=validate_all, ) + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) # After some transformation we see in the SDFG that there are pointwise views # generated after reduction nodes. These views are unecessary and might produce From ebf9ce76ba655e52fa70af890c9e2c60fb51d5fb Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Fri, 23 Jan 2026 08:16:12 +0100 Subject: [PATCH 16/20] Apparently experiment `1047b06ff0151ec15a426b694cd9f3f1cfa1d860` was somehow executed in a wrong way. In that experiment I wanted to run TF a second time after everything is done. It was fast, which puzzled me, because it contradicted my previous findings. 
However, I looked at the archived SDFG and at least that thing showed that TF was not run a second time. Thus I have decided to redo that part, so now let's run TF for a second time and see what happens. SLOW: 6.137449s --- .../runners/dace/transformations/auto_optimize.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index a91afde0f6..54e80fa2b3 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -723,16 +723,11 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) - # We see slowdowns if at the end is a call to the TF transformation, see commit - # `c40417421324dc274d3ce5fa01db21fa3b1b754e`. But if we put it just after - # `MoveDataflowIntoIfBody` then nothing happens. We will now try to replacate - # the earlier experiment by putting a simplify call in between. - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, - validate=False, - validate_all=validate_all, - ) + # Apparently there was an error/mistake when I did the original `1047b06ff` + # experiment. In that experiment I placed here another call to TF. Back then + # it appeared to be fast, which was super strange. It then checked the archived + # SDFG and it was indicating that TF was not run for a second time. Thus + # I have to redo the experiment which is done here. 
sdfg.apply_transformations_repeated( dace_dataflow.TaskletFusion, validate=False, From 42ecba3ad73e3ae9296935466a7826e9371d02cd Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Fri, 23 Jan 2026 08:34:57 +0100 Subject: [PATCH 17/20] The same as `ebf9ce76ba6` but this time the second TF only handles Tasklets with zero input degree and the SDFG must not be the top level one, i.e. real constant tasklet, that are not at the top level. WORKS: 4.68273s --- .../dace/transformations/auto_optimize.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 54e80fa2b3..ed2f31778e 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -654,6 +654,28 @@ def _gt_auto_process_top_level_maps( return sdfg +class TaskletFusion2(dace_dataflow.TaskletFusion): + """ + Implementation for experiment, that only processes Tasklets with no inputs and + that are not on the top level SDFG. + """ + + def can_be_applied( + self, + graph: dace.SDFGState, + expr_index: int, + sdfg: dace.SDFG, + permissive: bool = False, + ) -> bool: + if sdfg.parent is None: + return False + if graph.in_degree(self.t1) > 0: + return False + return super().can_be_applied( + graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive + ) + + def _gt_auto_process_dataflow_inside_maps( sdfg: dace.SDFG, blocking_dim: Optional[gtx_common.Dimension], @@ -729,7 +751,7 @@ def _gt_auto_process_dataflow_inside_maps( # SDFG and it was indicating that TF was not run for a second time. Thus # I have to redo the experiment which is done here. 
sdfg.apply_transformations_repeated( - dace_dataflow.TaskletFusion, + TaskletFusion2, validate=False, validate_all=validate_all, ) From 03a4d6791acc4942b03afc50af07943267e83a4c Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Fri, 23 Jan 2026 09:20:55 +0100 Subject: [PATCH 18/20] Slight variation of what we did in `42ecba3ad7`, instead of essentially ignoring empty Memlets (which is what the previous experiment should have done), we only consider them. SLOW: 6.658746s --- .../runners/dace/transformations/auto_optimize.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index ed2f31778e..7f91ef9e72 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -655,10 +655,7 @@ def _gt_auto_process_top_level_maps( class TaskletFusion2(dace_dataflow.TaskletFusion): - """ - Implementation for experiment, that only processes Tasklets with no inputs and - that are not on the top level SDFG. - """ + """Version of TaskletFusion` that _only_ processes Tasklet that have an empty Memlet.""" def can_be_applied( self, @@ -667,9 +664,7 @@ def can_be_applied( sdfg: dace.SDFG, permissive: bool = False, ) -> bool: - if sdfg.parent is None: - return False - if graph.in_degree(self.t1) > 0: + if not any(e.data.is_empty() for e in graph.in_edges(self.t1)): return False return super().can_be_applied( graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive From 5dd1a69029f0de572d1538f991fcbbfec5a09303 Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Fri, 23 Jan 2026 09:38:08 +0100 Subject: [PATCH 19/20] This is the real test, now like in `03a4d6791a` but now we ignore all Tasklets that have empty Memlets.
FAST: 4.5654s --- .../runners/dace/transformations/auto_optimize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 7f91ef9e72..daa0cb985e 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -664,7 +664,7 @@ def can_be_applied( sdfg: dace.SDFG, permissive: bool = False, ) -> bool: - if not any(e.data.is_empty() for e in graph.in_edges(self.t1)): + if any(e.data.is_empty() for e in graph.in_edges(self.t1)): return False return super().can_be_applied( graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive From cef303dff93d8c3ab0d83c0d326f9bc3dea2710b Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Fri, 23 Jan 2026 10:42:35 +0100 Subject: [PATCH 20/20] Essentially the same as in `42ecba3ad7`, i.e. only processing Tasklet that are nested, but in a more liberal way. 
FAST: 4.63308429 --- .../runners/dace/transformations/auto_optimize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index daa0cb985e..7259bcb586 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -655,7 +655,7 @@ def _gt_auto_process_top_level_maps( class TaskletFusion2(dace_dataflow.TaskletFusion): - """Version of TaskletFusion` that _only_ processes Tasklet that have an empty Memlet.""" + """Version of TaskletFusion` that _only_ processes Tasklet that are not on the top level.""" def can_be_applied( self, @@ -664,7 +664,7 @@ def can_be_applied( sdfg: dace.SDFG, permissive: bool = False, ) -> bool: - if any(e.data.is_empty() for e in graph.in_edges(self.t1)): + if sdfg.parent is None: return False return super().can_be_applied( graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive