Skip to content
Draft
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9cade71
Let's try this order.
philip-paul-mueller Jan 22, 2026
cba0996
Maybe this is better.
philip-paul-mueller Jan 22, 2026
8a389ee
Maybe the simplify call was unneeded.
philip-paul-mueller Jan 22, 2026
244dc10
This is nearer at the empirical version, let's try it.
philip-paul-mueller Jan 22, 2026
ac2c5ce
This is a bit nicer than the previous version, i.e. it has an explana…
philip-paul-mueller Jan 22, 2026
b68bae5
An experiment, let's see what happens.
philip-paul-mueller Jan 22, 2026
8211e57
Revert "An experiment, let's see what happens."
philip-paul-mueller Jan 22, 2026
aa6afa2
New experiment with a different inliner.
philip-paul-mueller Jan 22, 2026
03581a5
Revert "New experiment with a different inliner."
philip-paul-mueller Jan 22, 2026
c404174
Now let's check what happens if we at the end of the fast wersion run…
philip-paul-mueller Jan 22, 2026
cef7ebb
Revert "Now let's check what happens if we at the end of the fast wer…
philip-paul-mueller Jan 22, 2026
a17e6bf
Essentially the same experiment as in `c40417421324dc274d3ce5fa01db21…
philip-paul-mueller Jan 22, 2026
dcc4b5e
This version should be fast, but as soon as we know we will enabled t…
philip-paul-mueller Jan 22, 2026
1047b06
Now let's activate the second FT pass.
philip-paul-mueller Jan 22, 2026
0bffbcc
New experiment with running it twice.
philip-paul-mueller Jan 22, 2026
ebf9ce7
Apparently experiment `1047b06ff0151ec15a426b694cd9f3f1cfa1d860` was …
philip-paul-mueller Jan 23, 2026
42ecba3
The same as `ebf9ce76ba6` but this time the second TF only handles Ta…
philip-paul-mueller Jan 23, 2026
03a4d67
Slight variation of what we did in `42ecba3ad7`, instead of essential…
philip-paul-mueller Jan 23, 2026
5dd1a69
This is the real test, now like in `03a4d6791a` but now we ignoring a…
philip-paul-mueller Jan 23, 2026
cef303d
Essentially the same as in `42ecba3ad7`, i.e. only processing Tasklet…
philip-paul-mueller Jan 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import dace
from dace import data as dace_data
from dace.sdfg import nodes as dace_nodes, propagation as dace_propagation, utils as dace_sdutils
from dace.transformation import dataflow as dace_dataflow
from dace.transformation.auto import auto_optimize as dace_aoptimize
from dace.transformation.passes import analysis as dace_analysis

Expand Down Expand Up @@ -653,6 +654,23 @@ def _gt_auto_process_top_level_maps(
return sdfg


class TaskletFusion2(dace_dataflow.TaskletFusion):
    """Variant of `TaskletFusion` that never matches inside the top level SDFG.

    The transformation is only applicable if the SDFG that is passed to
    `can_be_applied()` has a parent, i.e. `sdfg.parent` is not `None`. In that
    case the decision is fully delegated to `TaskletFusion.can_be_applied()`.
    """

    def can_be_applied(
        self,
        graph: dace.SDFGState,
        expr_index: int,
        sdfg: dace.SDFG,
        permissive: bool = False,
    ) -> bool:
        # An SDFG without a parent is treated as the top level one; Tasklets that
        #  are located there are never fused by this transformation.
        is_nested_sdfg = sdfg.parent is not None
        return is_nested_sdfg and super().can_be_applied(
            graph=graph,
            expr_index=expr_index,
            sdfg=sdfg,
            permissive=permissive,
        )


def _gt_auto_process_dataflow_inside_maps(
sdfg: dace.SDFG,
blocking_dim: Optional[gtx_common.Dimension],
Expand All @@ -674,20 +692,6 @@ def _gt_auto_process_dataflow_inside_maps(
time, so the compiler will fully unroll them anyway.
"""

# Constants (tasklets are needed to write them into a variable) should not be
# arguments to a kernel but be present inside the body.
sdfg.apply_transformations_once_everywhere(
gtx_transformations.GT4PyMoveTaskletIntoMap,
validate=False,
validate_all=validate_all,
)
gtx_transformations.gt_simplify(
sdfg,
skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST,
validate=False,
validate_all=validate_all,
)

# Blocking is performed first, because this ensures that as much as possible
# is moved into the k independent part.
if blocking_dim is not None:
Expand All @@ -701,6 +705,27 @@ def _gt_auto_process_dataflow_inside_maps(
validate_all=validate_all,
)

# Empirical observations in MuPhys have shown that running `TaskletFusion` increases
# performance quite drastically. Thus it was added here. However, to ensure
# that `LoopBlocking` still works, i.e. that independent and dependent Tasklets
# are not mixed, it must run _after_ `LoopBlocking`. Furthermore, it has been shown
# that it has to run _before_ `GT4PyMoveTaskletIntoMap`. The reasons are not
# clear but it can be measured.
# TODO(phimuell): Restrict it to Tasklets only inside Maps.
sdfg.apply_transformations_repeated(
dace_dataflow.TaskletFusion,
validate=False,
validate_all=validate_all,
)

# Constants (tasklets are needed to write them into a variable) should not be
# arguments to a kernel but be present inside the body.
sdfg.apply_transformations_once_everywhere(
gtx_transformations.GT4PyMoveTaskletIntoMap,
validate=False,
validate_all=validate_all,
)

# Move dataflow into the branches of the `if` such that they are only evaluated
# if they are needed. Important to call it repeatedly.
# TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called
Expand All @@ -714,9 +739,14 @@ def _gt_auto_process_dataflow_inside_maps(
validate=False,
validate_all=validate_all,
)
gtx_transformations.gt_simplify(
sdfg,
skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST,

# Apparently there was an error/mistake when I did the original `1047b06ff`
# experiment. In that experiment I placed another call to TF here. Back then
# it appeared to be fast, which was super strange. I then checked the archived
# SDFG and it indicated that TF had not been run a second time. Thus
# I have to redo the experiment, which is done here.
sdfg.apply_transformations_repeated(
TaskletFusion2,
validate=False,
validate_all=validate_all,
)
Expand Down