diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 1d04c21fc3..7259bcb586 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -15,6 +15,7 @@ import dace from dace import data as dace_data from dace.sdfg import nodes as dace_nodes, propagation as dace_propagation, utils as dace_sdutils +from dace.transformation import dataflow as dace_dataflow from dace.transformation.auto import auto_optimize as dace_aoptimize from dace.transformation.passes import analysis as dace_analysis @@ -653,6 +654,23 @@ def _gt_auto_process_top_level_maps( return sdfg +class TaskletFusion2(dace_dataflow.TaskletFusion): + """Version of `TaskletFusion` that _only_ processes Tasklets that are not on the top level.""" + + def can_be_applied( + self, + graph: dace.SDFGState, + expr_index: int, + sdfg: dace.SDFG, + permissive: bool = False, + ) -> bool: + if sdfg.parent is None: + return False + return super().can_be_applied( + graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive + ) + + def _gt_auto_process_dataflow_inside_maps( sdfg: dace.SDFG, blocking_dim: Optional[gtx_common.Dimension], @@ -674,20 +692,6 @@ def _gt_auto_process_dataflow_inside_maps( time, so the compiler will fully unroll them anyway. """ - # Constants (tasklets are needed to write them into a variable) should not be - # arguments to a kernel but be present inside the body. 
- sdfg.apply_transformations_once_everywhere( - gtx_transformations.GT4PyMoveTaskletIntoMap, - validate=False, - validate_all=validate_all, - ) - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, - validate=False, - validate_all=validate_all, - ) - # Blocking is performed first, because this ensures that as much as possible # is moved into the k independent part. if blocking_dim is not None: @@ -701,6 +705,27 @@ def _gt_auto_process_dataflow_inside_maps( validate_all=validate_all, ) + # Empirical observations in MuPhys have shown that running `TaskletFusion` increases + # performance quite drastically. Thus it was added here. However, to ensure + # that `LoopBlocking` still works, i.e. independent and dependent Tasklets are + # not mixed, it must run _after_ `LoopBlocking`. Furthermore, it has been shown + # that it has to run _before_ `GT4PyMoveTaskletIntoMap`. The reasons are not + # clear, but the effect can be measured. + # TODO(phimuell): Restrict it to Tasklets only inside Maps. + sdfg.apply_transformations_repeated( + dace_dataflow.TaskletFusion, + validate=False, + validate_all=validate_all, + ) + + # Constants (tasklets are needed to write them into a variable) should not be + # arguments to a kernel but be present inside the body. + sdfg.apply_transformations_once_everywhere( + gtx_transformations.GT4PyMoveTaskletIntoMap, + validate=False, + validate_all=validate_all, + ) + # Move dataflow into the branches of the `if` such that they are only evaluated # if they are needed. Important to call it repeatedly. 
# TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called @@ -714,9 +739,14 @@ def _gt_auto_process_dataflow_inside_maps( validate=False, validate_all=validate_all, ) - gtx_transformations.gt_simplify( - sdfg, - skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST, + + # Apparently there was an error/mistake when I did the original `1047b06ff` + # experiment. In that experiment I placed here another call to TF. Back then + # it appeared to be fast, which was super strange. I then checked the archived + # SDFG and it was indicating that TF was not run for a second time. Thus + # I have to redo the experiment, which is done here. + sdfg.apply_transformations_repeated( + TaskletFusion2, validate=False, validate_all=validate_all, )