diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
index 1d04c21fc3..7259bcb586 100644
--- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
+++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py
@@ -15,6 +15,7 @@
 import dace
 from dace import data as dace_data
 from dace.sdfg import nodes as dace_nodes, propagation as dace_propagation, utils as dace_sdutils
+from dace.transformation import dataflow as dace_dataflow
 from dace.transformation.auto import auto_optimize as dace_aoptimize
 from dace.transformation.passes import analysis as dace_analysis
 
@@ -653,6 +654,23 @@ def _gt_auto_process_top_level_maps(
     return sdfg
 
 
+class TaskletFusion2(dace_dataflow.TaskletFusion):
+    """Version of TaskletFusion` that _only_ processes Tasklet that are not on the top level."""
+
+    def can_be_applied(
+        self,
+        graph: dace.SDFGState,
+        expr_index: int,
+        sdfg: dace.SDFG,
+        permissive: bool = False,
+    ) -> bool:
+        if sdfg.parent is None:
+            return False
+        return super().can_be_applied(
+            graph=graph, expr_index=expr_index, sdfg=sdfg, permissive=permissive
+        )
+
+
 def _gt_auto_process_dataflow_inside_maps(
     sdfg: dace.SDFG,
     blocking_dim: Optional[gtx_common.Dimension],
@@ -674,20 +692,6 @@ def _gt_auto_process_dataflow_inside_maps(
     time, so the compiler will fully unroll them anyway.
     """
 
-    # Constants (tasklets are needed to write them into a variable) should not be
-    #  arguments to a kernel but be present inside the body.
-    sdfg.apply_transformations_once_everywhere(
-        gtx_transformations.GT4PyMoveTaskletIntoMap,
-        validate=False,
-        validate_all=validate_all,
-    )
-    gtx_transformations.gt_simplify(
-        sdfg,
-        skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST,
-        validate=False,
-        validate_all=validate_all,
-    )
-
     # Blocking is performed first, because this ensures that as much as possible
     #  is moved into the k independent part.
     if blocking_dim is not None:
@@ -701,6 +705,27 @@ def _gt_auto_process_dataflow_inside_maps(
             validate_all=validate_all,
         )
 
+    # Empirical observation in MuPhys have shown that running `TaskletFusion` increases
+    #  performance quite drastically. Thus it was added here. However, to ensure
+    #  that `LoopBlocking` still works, i.e. independent and dependent Tasklets are
+    #  not mixed it must run _after_ `LoopBlocking`. Furthermore, it has been shown
+    #  that it has to run _before_ `GT4PyMoveTaskletIntoMap`. The reasons are not
+    #  clear but it can be measured.
+    # TODO(phimuell): Restrict it to Tasklets only inside Maps.
+    sdfg.apply_transformations_repeated(
+        dace_dataflow.TaskletFusion,
+        validate=False,
+        validate_all=validate_all,
+    )
+
+    # Constants (tasklets are needed to write them into a variable) should not be
+    #  arguments to a kernel but be present inside the body.
+    sdfg.apply_transformations_once_everywhere(
+        gtx_transformations.GT4PyMoveTaskletIntoMap,
+        validate=False,
+        validate_all=validate_all,
+    )
+
     # Move dataflow into the branches of the `if` such that they are only evaluated
     #  if they are needed. Important to call it repeatedly.
     # TODO(phimuell): It is unclear if `MoveDataflowIntoIfBody` should be called
@@ -714,9 +739,14 @@ def _gt_auto_process_dataflow_inside_maps(
         validate=False,
         validate_all=validate_all,
     )
-    gtx_transformations.gt_simplify(
-        sdfg,
-        skip=gtx_transformations.constants._GT_AUTO_OPT_INNER_DATAFLOW_STAGE_SIMPLIFY_SKIP_LIST,
+
+    # Apparently there was an error/mistake when I did the original `1047b06ff`
+    #  experiment. In that experiment I placed here another call to TF. Back then
+    #  it appeared to be fast, which was super strange. It then checked the archived
+    #  SDFG and it was indicating that TF was not run for a second time. Thus
+    #  I have to redo the experiment which is done here.
+    sdfg.apply_transformations_repeated(
+        TaskletFusion2,
         validate=False,
         validate_all=validate_all,
     )