diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt index 5a8e6438eb..ce71cda1a6 100644 --- a/dace/codegen/CMakeLists.txt +++ b/dace/codegen/CMakeLists.txt @@ -58,7 +58,8 @@ foreach(DACE_FILE ${DACE_FILES}) # Make the path absolute set(DACE_FILE ${DACE_SRC_DIR}/${DACE_FILE}) # Now treat the file according to the deduced target - if(${DACE_FILE_TARGET} STREQUAL "cuda") + # previous: if(${DACE_FILE_TARGET} STREQUAL "cuda"). Needed to work with experimental + if(${DACE_FILE_TARGET} STREQUAL "experimental_cuda" OR ${DACE_FILE_TARGET} STREQUAL "cuda") if(${DACE_FILE_TARGET_TYPE} MATCHES "hip") set(DACE_ENABLE_HIP ON) set(DACE_HIP_FILES ${DACE_HIP_FILES} ${DACE_FILE}) diff --git a/dace/codegen/instrumentation/gpu_events.py b/dace/codegen/instrumentation/gpu_events.py index 9c653342cd..99a91e3b3f 100644 --- a/dace/codegen/instrumentation/gpu_events.py +++ b/dace/codegen/instrumentation/gpu_events.py @@ -129,7 +129,7 @@ def on_scope_entry(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, n 'GPU_Device map scopes') idstr = 'b' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.ExitNode, @@ -139,7 +139,7 @@ def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no s = self._get_sobj(node) if s.instrument == dtypes.InstrumentationType.GPU_Events: idstr = 'e' + self._idstr(cfg, state, entry_node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) outer_stream.write(self._report('%s %s' % (type(s).__name__, s.label), cfg, state, entry_node), cfg, state_id, node) @@ -153,7 +153,7 @@ def on_node_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no if node.instrument == dtypes.InstrumentationType.GPU_Events: state_id = state.parent_graph.node_id(state) idstr = 'b' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node, @@ -165,7 +165,46 @@ def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node if node.instrument == dtypes.InstrumentationType.GPU_Events: state_id = state.parent_graph.node_id(state) idstr = 'e' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) outer_stream.write(self._report('%s %s' % (type(node).__name__, node.label), cfg, state, node), cfg, state_id, node) + + def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int: + """ + Return the GPU stream ID assigned to a given node. + + - In the CUDACodeGen, the stream ID is stored as the private attribute + ``_cuda_stream`` on the node. + - In the ExperimentalCUDACodeGen, streams are explicitly assigned to tasklets + and GPU_Device-scheduled maps (kernels) via a GPU stream AccessNode. For + other node types, no reliable stream assignment is available. + + Parameters + ---------- + state : SDFGState + The state containing the node. + node : dace.sdfg.nodes.Node + The node for which to query the GPU stream. 
+ + Returns + ------- + int + The assigned GPU stream ID, or ``-1`` if none could be determined. + """ + if config.Config.get('compiler', 'cuda', 'implementation') == 'legacy': + stream = getattr(node, '_cuda_stream', -1) + + else: + stream = -1 + for in_edge in state.in_edges(node): + src = in_edge.src + if (isinstance(src, nodes.AccessNode) and src.desc(state).dtype == dtypes.gpuStream_t): + stream = int(in_edge.data.subset) + + for out_edge in state.out_edges(node): + dst = out_edge.dst + if (isinstance(dst, nodes.AccessNode) and dst.desc(state).dtype == dtypes.gpuStream_t): + stream = int(out_edge.data.subset) + + return stream diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index cd4d5f957f..5c9027e68e 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,3 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen +from .experimental_cuda import ExperimentalCUDACodeGen diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index b451668831..12f09ba42c 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -236,14 +236,22 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher', def is_cuda_codegen_in_device(framecode) -> bool: """ - Check the state of the CUDA code generator, whether it is inside device code. + Check the state of the (Experimental) CUDA code generator, whether it is inside device code. """ from dace.codegen.targets.cuda import CUDACodeGen + from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen + + cuda_impl = Config.get('compiler', 'cuda', 'implementation') + if cuda_impl == 'legacy': + cudaClass = CUDACodeGen + elif cuda_impl == 'experimental': + cudaClass = ExperimentalCUDACodeGen + if framecode is None: cuda_codegen_in_device = False else: for codegen in framecode.targets: - if isinstance(codegen, CUDACodeGen): + if isinstance(codegen, cudaClass): cuda_codegen_in_device = codegen._in_device_code break else: @@ -266,11 +274,9 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: root = name.split('.')[0] if root in sdfg.arrays and isinstance(sdfg.arrays[root], data.Structure): name = name.replace('.', '->') - # Special case: If memory is persistent and defined in this SDFG, add state # struct to name if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): - if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' elif not is_cuda_codegen_in_device(framecode): # GPU kernels cannot access state @@ -936,7 +942,7 @@ def unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, callsite_st # set the stream to a local variable. 
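
For orientation, here is a minimal sketch (not part of the patch) of the SDFG structure that the new `_get_gpu_stream` helper expects under the experimental codegen: an AccessNode whose descriptor has dtype `dtypes.gpuStream_t` (introduced by this PR) is wired to a tasklet, and the assigned stream index is encoded in the memlet subset. The array name `gpu_streams`, the connector `stream_in`, and the Register storage choice are illustrative assumptions, not taken from the PR.

    import dace
    from dace import dtypes
    from dace.sdfg import nodes

    sdfg = dace.SDFG('gpu_stream_sketch')
    state = sdfg.add_state()

    # Hypothetical GPU-stream array; dtypes.gpuStream_t is the type added by this PR
    sdfg.add_array('gpu_streams', shape=(4,), dtype=dtypes.gpuStream_t,
                   storage=dtypes.StorageType.Register, transient=True)

    streams = state.add_access('gpu_streams')
    tasklet = state.add_tasklet('work', {'stream_in'}, set(), 'pass')

    # The memlet subset ("2") encodes the GPU stream assigned to the tasklet
    state.add_edge(streams, None, tasklet, 'stream_in', dace.Memlet('gpu_streams[2]'))

    # Recovering the assignment, analogous to the non-legacy branch of _get_gpu_stream
    for e in state.in_edges(tasklet):
        if isinstance(e.src, nodes.AccessNode) and e.src.desc(state).dtype == dtypes.gpuStream_t:
            print(e.data.subset)  # assigned stream index; _get_gpu_stream converts this subset to an int
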
max_streams = int(Config.get("compiler", "cuda", "max_concurrent_streams"))
     if not is_devicelevel_gpu(sdfg, state_dfg, node) and (hasattr(node, "_cuda_stream")
-                                                          or connected_to_gpu_memory(node, state_dfg, sdfg)):
+                                                          and connected_to_gpu_memory(node, state_dfg, sdfg)):
         if max_streams >= 0:
             callsite_stream.write(
                 'int __dace_current_stream_id = %d;\n%sStream_t __dace_current_stream = __state->gpu_context->streams[__dace_current_stream_id];'
diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index 5e71cbb074..228613bae7 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -513,6 +513,13 @@ def allocate_array(self,
             return
         elif (nodedesc.storage == dtypes.StorageType.Register):
+
+            if nodedesc.dtype == dtypes.gpuStream_t:
+                ctype = dtypes.gpuStream_t.ctype
+                allocation_stream.write(f"{ctype}* {name} = __state->gpu_context->streams;")
+                define_var(name, DefinedType.Pointer, ctype)
+                return
+
             ctypedef = dtypes.pointer(nodedesc.dtype).ctype
             if nodedesc.start_offset != 0:
                 raise NotImplementedError('Start offset unsupported for registers')
@@ -588,6 +595,9 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap
         if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)):
             return
+        elif nodedesc.dtype == dtypes.gpuStream_t:
+            callsite_stream.write(f"{alloc_name} = nullptr;")
+            return
         elif (nodedesc.storage == dtypes.StorageType.CPU_Heap
               or (nodedesc.storage == dtypes.StorageType.Register and
                   (symbolic.issymbolic(arrsize, sdfg.constants) or
@@ -1008,6 +1018,11 @@ def process_out_memlets(self,
                 dst_edge = dfg.memlet_path(edge)[-1]
                 dst_node = dst_edge.dst
 
+            if isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state).dtype == dtypes.gpuStream_t:
+                # Special case: GPU stream access nodes do not represent data flow; they assign GPU streams
+                # to kernels/tasklets. Thus, nothing needs to be written, and out-memlets of this kind are ignored.
+                continue
+
             # Target is neither a data nor a tasklet node
             if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode)
                                                        and not isinstance(dst_node, nodes.CodeNode)):
@@ -1049,6 +1064,7 @@ def process_out_memlets(self,
             # Tasklet -> array with a memlet. Writing to array is emitted only if the memlet is not empty
             if isinstance(node, nodes.CodeNode) and not edge.data.is_empty():
                 if not uconn:
+                    return
                     raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format(
                         str(edge.src), str(edge.dst)))
@@ -1585,6 +1601,10 @@ def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: State
         cdtype = src_node.out_connectors[edge.src_conn]
         if isinstance(sdfg.arrays[edge.data.data], data.Stream):
             pass
+        elif isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state_dfg).dtype == dtypes.gpuStream_t:
+            # Special case: GPU stream access nodes do not represent data flow; they assign GPU streams
+            # to kernels/tasklets. Thus, nothing needs to be written.
+            pass
         elif isinstance(cdtype, dtypes.pointer):
             # If pointer, also point to output
             desc = sdfg.arrays[edge.data.data]
diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py
new file mode 100644
index 0000000000..0d3dce577c
--- /dev/null
+++ b/dace/codegen/targets/experimental_cuda.py
@@ -0,0 +1,1552 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+import networkx as nx
+
+import dace
+from dace import data as dt, Memlet
+from dace import dtypes, registry, symbolic, subsets
+from dace.config import Config
+from dace.sdfg import SDFG, ScopeSubgraphView, SDFGState, nodes
+from dace.sdfg import utils as sdutil
+from dace.sdfg.graph import MultiConnectorEdge
+from dace.sdfg.state import ControlFlowRegion, StateSubgraphView
+
+from dace.codegen import common
+from dace.codegen.codeobject import CodeObject
+from dace.codegen.dispatcher import DefinedType, TargetDispatcher
+from dace.codegen.prettycode import CodeIOStream
+from dace.codegen.common import update_persistent_desc
+from dace.codegen.targets.cpp import (codeblock_to_cpp, memlet_copy_to_absolute_strides, mangle_dace_state_struct_name,
+                                      ptr, sym2cpp)
+from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute
+
+# DaCe transformation imports
+from dace.transformation.passes import analysis as ap
+from dace.transformation.pass_pipeline import Pipeline
+from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler
+from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
+from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels
+from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets
+from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets
+from dace.transformation.passes.gpustream.gpu_stream_topology_simplification import GPUStreamTopologySimplification
+from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets
+from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync
+from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap
+from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize
+
+# Experimental CUDA helper imports
+from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager
+from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call, get_defined_type
+
+from dace.codegen.targets import cpp
+
+# Type checking imports (conditional)
+if TYPE_CHECKING:
+    from dace.codegen.targets.framecode import DaCeCodeGenerator
+    from dace.codegen.targets.cpu import CPUCodeGen
+
+
+@registry.autoregister_params(name='experimental_cuda')
+class ExperimentalCUDACodeGen(TargetCodeGenerator):
+    """ Experimental CUDA code generator. """
+    target_name = 'experimental_cuda'
+    title = 'CUDA'
+
+    ###########################################################################
+    # Initialization & Preprocessing
+
+    def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG):
+
+        self._frame: DaCeCodeGenerator = frame_codegen  # creates the frame code and orchestrates code generation for targets
+        self._dispatcher: TargetDispatcher = frame_codegen.dispatcher  # dispatches code generation to the appropriate target
+
+        self._in_device_code = False
+        self._cpu_codegen: Optional['CPUCodeGen'] = None
+
+        # NOTE: Moved from preprocessing to here
+        self.backend: str = common.get_gpu_backend()
+        self.language = 'cu' if self.backend == 'cuda' else 'cpp'
+        target_type = '' if self.backend == 'cuda' else self.backend
+        self._codeobject = CodeObject(sdfg.name + '_' + 'cuda',
+                                      '',
+                                      self.language,
+                                      ExperimentalCUDACodeGen,
+                                      'CUDA',
+                                      target_type=target_type)
+
+        self._localcode = CodeIOStream()
+        self._globalcode = CodeIOStream()
+
+        # TODO: _initcode and _exitcode do not seem to serve any purpose at the moment.
+        self._initcode = CodeIOStream()
+        self._exitcode = CodeIOStream()
+
+        self._global_sdfg: SDFG = sdfg
+        self._toplevel_schedule = None
+
+        # Positions at which to deallocate memory pool arrays
+        self.pool_release: Dict[Tuple[SDFG, str], Tuple[SDFGState, Set[nodes.Node]]] = {}
+        self.has_pool = False
+
+        # INFO:
+        # Register GPU schedules and storage types for ExperimentalCUDACodeGen.
+        # The dispatcher maps GPU-related schedules and storage types to the
+        # appropriate code generation functions in this code generator.
+
+        # Register dispatchers
+        self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher()
+
+        self._dispatcher = frame_codegen.dispatcher
+        self._dispatcher.register_map_dispatcher(dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN, self)
+        self._dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate)
+        self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate)
+
+        # TODO: Add this to dtypes as well
+        gpu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned]
+
+        self._dispatcher.register_array_dispatcher(gpu_storage, self)
+        self._dispatcher.register_array_dispatcher(dtypes.StorageType.CPU_Pinned, self)
+        for storage in gpu_storage:
+            for other_storage in dtypes.StorageType:
+                self._dispatcher.register_copy_dispatcher(storage, other_storage, None, self)
+                self._dispatcher.register_copy_dispatcher(other_storage, storage, None, self)
+
+        # NOTE:
+        # The "register illegal copies" code from cuda.py was NOT copied here;
+        # it was never needed.
+
+        ################## New variables ##########################
+
+        self._current_kernel_spec: Optional[KernelSpec] = None
+        self._gpu_stream_manager: Optional[GPUStreamManager] = None
+        self._kernel_dimensions_map: Set[nodes.MapEntry] = set()
+        self._kernel_arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {}
+
+    def preprocess(self, sdfg: SDFG) -> None:
+        """
+        Preprocess the SDFG to prepare it for GPU code generation. This includes:
+
+        - Handling GPU<->GPU strided copies.
+        - Adding explicit GPU_ThreadBlock maps where they are missing and inferring grid and block
+          dimensions for every kernel in the SDFG.
+        - Running a pipeline that makes GPU streams explicit at the SDFG level and handling other
+          GPU stream related initialization.
+        - TODO
+        - Handling memory pool management.
+
+        Note that the order of the steps matters, e.g. TODO
+        """
+
+        #------------------------- Handle GPU<->GPU strided copies --------------------------
+
+        # Find GPU<->GPU strided copies that cannot be represented by a single copy command
+        from dace.transformation.dataflow import CopyToMap
+        for e, state in list(sdfg.all_edges_recursive()):
+            if isinstance(e.src, nodes.AccessNode) and isinstance(e.dst, nodes.AccessNode):
+                nsdfg = state.parent
+                if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global
+                        and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global):
+                    copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides(
+                        None, nsdfg, state, e, e.src, e.dst)
+                    dims = len(copy_shape)
+
+                    # Skip supported copy types
+                    if dims == 1:
+                        continue
+                    elif dims == 2:
+                        if src_strides[-1] != 1 or dst_strides[-1] != 1:
+                            # NOTE: Special case of a contiguous copy
+                            # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
+                            # with copy shape [I, J] and strides [J*K, K], [J, 1]
+                            try:
+                                is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
+                                is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
+                            except (TypeError, ValueError):
+                                is_src_cont = False
+                                is_dst_cont = False
+                            if is_src_cont and is_dst_cont:
+                                continue
+                        else:
+                            continue
+                    elif dims > 2:
+                        if not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+                            continue
+
+                    # Turn unsupported copy into a map
+                    try:
+                        CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst)
+                    except ValueError:  # If the transformation doesn't match, continue normally
+                        continue
+
+        #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes --------------------
+
+        # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion
+        # of ThreadBlock maps. Note: the original Kernel Entry was transformed into a ThreadBlock map,
+        # and a new GPU_Device (i.e., Kernel) map was inserted on top of it.
+        old_nodes = set(node for node, _ in sdfg.all_nodes_recursive())
+
+        # Insert default explicit GPU_ThreadBlock maps where they are missing
+        sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)
+
+        new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes
+        kernels_with_added_tb_maps = {
+            n
+            for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device
+        }
+
+        # Infer GPU grid and block dimensions
+        self._kernel_dimensions_map = InferGPUGridAndBlockSize().apply_pass(sdfg, kernels_with_added_tb_maps)
+
+        #------------------------- GPU Stream related Logic --------------------------
+
+        # Register GPU context in state struct
+        self._frame.statestruct.append('dace::cuda::Context *gpu_context;')
+
+        # Prepare the pipeline that makes GPU streams explicit: add and connect SDFG nodes
+        # with GPU stream AccessNodes where used
+        stream_pipeline = Pipeline([
+            NaiveGPUStreamScheduler(),
+            InsertGPUStreamsToSDFGs(),
+            InsertGPUStreamsToKernels(),
+            InsertGPUStreamsToTasklets(),
+            InsertGPUStreamSyncTasklets(),
+            InsertGPUCopyTasklets(),
+            GPUStreamTopologySimplification(),
+        ])
+
+        # TODO: Copies may be missed due to InsertGPUCopyTasklets -> maybe check whether the copies were
+        # handled above rather than just adding this codegen to used_targets by default
+        self._dispatcher._used_targets.add(self)
+        gpustream_assignments = stream_pipeline.apply_pass(sdfg, {})['NaiveGPUStreamScheduler']
+
+        # Initialize runtime GPU stream manager
+        self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments)
+
+        #----------------- Shared Memory Synchronization related Logic -----------------
+
+        auto_sync = Config.get('compiler', 'cuda', 'auto_syncthreads_insertion')
+        if auto_sync:
+            DefaultSharedMemorySync().apply_pass(sdfg, None)
+
+        #------------------------- Memory Pool related Logic --------------------------
+
+        # Find points where memory should be released to the memory pool
+        self._compute_pool_release(sdfg)
+
+        # Retrieve the arguments required for each kernel's subgraph
+        shared_transients = {}
+        for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True):
+            if (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device):
+                if state.parent not in shared_transients:
+                    shared_transients[state.parent] = state.parent.shared_transients()
+                self._kernel_arglists[node] = state.scope_subgraph(node).arglist(defined_syms,
+                                                                                 shared_transients[state.parent])
+
+    def _compute_pool_release(self, top_sdfg: SDFG):
+        """
+        Computes positions in the code generator where a memory pool array is no longer used and
+        ``backendFreeAsync`` should be called to release it.
+
+        :param top_sdfg: The top-level SDFG to traverse.
+        :raises ValueError: If the backend does not support memory pools.
+        """
+        # Find release points for every array in every SDFG
+        reachability = access_nodes = None
+        for sdfg in top_sdfg.all_sdfgs_recursive():
+            # Skip SDFGs without memory pool hints
+            pooled = set(aname for aname, arr in sdfg.arrays.items()
+                         if getattr(arr, 'pool', False) is True and arr.transient)
+            if not pooled:
+                continue
+            self.has_pool = True
+            if self.backend != 'cuda':
+                raise ValueError(f'Backend "{self.backend}" does not support the memory pool allocation hint')
+
+            # Keep only global arrays
+            pooled = filter(
+                lambda aname: sdfg.arrays[aname].lifetime in
+                (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent,
+                 dtypes.AllocationLifetime.External), pooled)
+
+            # Lazily compute reachability and access nodes
+            if reachability is None:
+                reachability = ap.StateReachability().apply_pass(top_sdfg, {})
+                access_nodes = ap.FindAccessStates().apply_pass(top_sdfg, {})
+
+            reachable = reachability[sdfg.cfg_id]
+            access_sets = access_nodes[sdfg.cfg_id]
+            for state in sdfg.states():
+                # Find all data descriptors that will no longer be used after this state
+                last_state_arrays: Set[str] = set(
+                    s for s in access_sets
+                    if s in pooled and state in access_sets[s] and not (access_sets[s] & reachable[state]) - {state})
+
+                anodes = list(state.data_nodes())
+                for aname in last_state_arrays:
+                    # Find out if there is a common descendant access node.
+                    # If not, release at end of state
+                    ans = [an for an in anodes if an.data == aname]
+                    terminator = None
+                    for an1 in ans:
+                        if all(nx.has_path(state.nx, an2, an1) for an2 in ans if an2 is not an1):
+                            terminator = an1
+                            break
+
+                    # Old logic below; now we use the GPU stream manager, which automatically returns nullptr
+                    # for all nodes that did not get assigned a CUDA stream.
+                    """
+                    # Enforce a cuda_stream field so that the state-wide deallocation would work
+                    if not hasattr(an1, '_cuda_stream'):
+                        an1._cuda_stream = 'nullptr'
+                    """
+
+                    # If an access node was found, find the point where all its reads are complete
+                    terminators = set()
+                    if terminator is not None:
+                        parent = state.entry_node(terminator)
+                        # If within a scope, once all memlet paths going out of that scope are complete,
+                        # it is time to release the memory
+                        if parent is not None:
+                            # Just to be safe, release at end of state (e.g., if misused in Sequential map)
+                            terminators = set()
+                        else:
+                            # Otherwise, find common descendant (or end of state) following the ends of
+                            # all memlet paths (e.g., (a)->...->[tasklet]-->...->(b))
+                            for e in state.out_edges(terminator):
+                                if isinstance(e.dst, nodes.EntryNode):
+                                    terminators.add(state.exit_node(e.dst))
+                                else:
+                                    terminators.add(e.dst)
+                            # After all outgoing memlets of all the terminators have been processed, memory
+                            # will be released
+
+                    self.pool_release[(sdfg, aname)] = (state, terminators)
+
+            # If there is unfreed pooled memory, free at the end of the SDFG
+            unfreed = set(arr for arr in pooled if (sdfg, arr) not in self.pool_release)
+            if unfreed:
+                # Find or make single sink node
+                sinks = sdfg.sink_nodes()
+                if len(sinks) == 1:
+                    sink = sinks[0]
+                elif len(sinks) > 1:
+                    sink = sdfg.add_state()
+                    for s in sinks:
+                        sdfg.add_edge(s, sink)
+                else:  # len(sinks) == 0:
+                    raise ValueError('End state not found when trying to free pooled memory')
+
+                # Add sink as terminator state
+                for arr in unfreed:
+                    self.pool_release[(sdfg, arr)] = (sink, set())
+
+    ###########################################################################
+    # Determine whether initializer and finalizer should be called
+
+    @property
+    def has_initializer(self) -> bool:
+        return True
+
+    @property
+    def has_finalizer(self) -> bool:
+        return True
+
+    ###########################################################################
+    # Scope generation
+
+    def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int,
+                       function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+
+        # Import strategies here to avoid circular dependencies
+        from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import (ScopeGenerationStrategy,
+                                                                                     KernelScopeGenerator,
+                                                                                     ThreadBlockScopeGenerator,
+                                                                                     WarpScopeGenerator)
+        # Entry node of the scope
+        scope_entry = 
dfg_scope.source_nodes()[0] + + #--------------- Start of Kernel Function Code Generation -------------------- + + if not self._in_device_code: + + # Enter kernel context and recursively generate device code + + state = cfg.state(state_id) + scope_entry = dfg_scope.source_nodes()[0] + scope_exit = dfg_scope.sink_nodes()[0] + scope_entry_stream = CodeIOStream() + scope_exit_stream = CodeIOStream() + + # Instrumentation for kernel scope + instr = self._dispatcher.instrumentation[scope_entry.map.instrument] + if instr is not None: + instr.on_scope_entry(sdfg, cfg, state, scope_entry, callsite_stream, scope_entry_stream, + self._globalcode) + outer_stream = CodeIOStream() + instr.on_scope_exit(sdfg, cfg, state, scope_exit, outer_stream, scope_exit_stream, self._globalcode) + + # New scope for defined variables (kernel functions scope) + self._dispatcher.defined_vars.enter_scope(scope_entry) + + # Store kernel metadata (name, dimensions, arguments, etc.) in a KernelSpec object + # and save it as an attribute + kernel_spec = KernelSpec(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id) + + self._current_kernel_spec = kernel_spec + + # (Re)define variables for the new scope + self._define_variables_in_kernel_scope(sdfg, self._dispatcher) + + # declare and call kernel wrapper function (in the CPU-side code) + self._declare_and_invoke_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Recursively generate GPU code into the kernel_stream (will be in a .cu file) + kernel_stream = CodeIOStream() + kernel_function_stream = self._globalcode + + self._in_device_code = True + + kernel_scope_generator = KernelScopeGenerator(codegen=self) + if kernel_scope_generator.applicable(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream): + kernel_scope_generator.generate(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream) + else: + raise ValueError("Invalid kernel configuration: This strategy is only applicable if the " + "outermost GPU schedule is of type GPU_Device (most likely cause).") + + self._localcode.write(scope_entry_stream.getvalue()) + + # Append generated kernel code to localcode + self._localcode.write(kernel_stream.getvalue() + '\n') + + self._localcode.write(scope_exit_stream.getvalue()) + + # Exit kernel context + self._in_device_code = False + + # Generate kernel wrapper, i.e. 
function which will launch the kernel + self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Exit scope for defined variables + self._dispatcher.defined_vars.exit_scope(scope_entry) + + if instr is not None: + callsite_stream.write(outer_stream.getvalue()) + + return + + import copy + from dace.transformation.passes.fix_test import Fix + from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel + from dace.sdfg import infer_types + + names = Fix().apply_pass(sdfg, {}) + for name, map_parent in names.items(): + MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) + infer_types.infer_connector_types(sdfg) + + #--------------- Nested GPU Scope -------------------- + supported_strategies: List[ScopeGenerationStrategy] = [ + ThreadBlockScopeGenerator(codegen=self), + WarpScopeGenerator(codegen=self) + ] + + for strategy in supported_strategies: + if strategy.applicable(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream): + strategy.generate(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + return + + #--------------- Unsupported Cases -------------------- + # Note: We are inside a nested GPU scope at this point. + + schedule_type = scope_entry.map.schedule + + if schedule_type == dace.ScheduleType.GPU_Device: + raise NotImplementedError("Dynamic parallelism (nested GPU_Device schedules) is not supported.") + + raise NotImplementedError( + f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " + "Please check for supported schedule types or implement the corresponding strategy.") + + def _define_variables_in_kernel_scope(self, sdfg: SDFG, dispatcher: TargetDispatcher): + """ + Define kernel-visible variables in the dispatcher's scope. + + - Certain variables stored in the host-side ``__state`` struct (e.g., persistent or external + data) cannot be accessed directly in kernel code. They are passed as arguments instead, with + pointer names resolved via ``cpp.ptr(..)``. These must be registered in the dispatcher for use + in kernel context. + + - KernelSpec may also mark certain variables/arguments as constants, which must be registered with + the appropriate ``const`` qualifier in their ctype. 
+ """ + # Extract argument and constant definitions from the KernelSpec + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_constants: Set[str] = kernel_spec.kernel_constants + kernel_arglist: Dict[str, dt.Data] = kernel_spec.arglist + + # Save current in_device_code value for restoration later + restore_in_device_code = self._in_device_code + for name, data_desc in kernel_arglist.items(): + + # Only arrays relevant + if not name in sdfg.arrays: + continue + + data_desc = sdfg.arrays[name] + # Get the outer/host pointer name + self._in_device_code = False + host_ptrname = cpp.ptr(name, data_desc, sdfg, self._frame) + + # Get defined type and ctype for the data (use host pointer name) + is_global: bool = data_desc.lifetime in (dtypes.AllocationLifetime.Global, + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) + defined_type, ctype = dispatcher.defined_vars.get(host_ptrname, is_global=is_global) + + # Get the inner/device pointer name + self._in_device_code = True + device_ptrname = cpp.ptr(name, data_desc, sdfg, self._frame) + + # Add the const qualifier if it is a constant AND is not marked as such yet + if name in kernel_constants: + if not "const " in ctype: + ctype = f"const {ctype}" + + # Register variable with the device pointer name for the kernel context + dispatcher.defined_vars.add(device_ptrname, defined_type, ctype, allow_shadowing=True) + + # Restore in_device_code field + self._in_device_code = restore_in_device_code + + def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + + scope_entry = dfg_scope.source_nodes()[0] + + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_name = kernel_spec.kernel_name + kernel_wrapper_args_as_input = kernel_spec.kernel_wrapper_args_as_input + kernel_wrapper_args_typed = kernel_spec.kernel_wrapper_args_typed + + # Declaration of the kernel wrapper function (in the CPU-side code) + function_stream.write( + 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, + state_id, scope_entry) + + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. + state = cfg.state(state_id) + if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('{', cfg, state_id, scope_entry) + + # Synchronize all events leading to dynamic map range connectors + for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): + callsite_stream.write( + self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), + cfg, state_id, scope_entry) + + # Calling the kernel wrapper function (in the CPU-side code) + callsite_stream.write('__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_as_input)), + cfg, state_id, scope_entry) + + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. 
+ if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('}', cfg, state_id, scope_entry) + + def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + scope_entry = dfg_scope.source_nodes()[0] + + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_name = kernel_spec.kernel_name + kernel_args_as_input = kernel_spec.args_as_input + kernel_launch_args_typed = kernel_spec.kernel_wrapper_args_typed + + # get kernel dimensions and transform into a c++ string + grid_dims = kernel_spec.grid_dims + block_dims = kernel_spec.block_dims + gdims = ', '.join(sym2cpp(grid_dims)) + bdims = ', '.join(sym2cpp(block_dims)) + + # ----------------- Kernel Launch Function Declaration ----------------------- + + self._localcode.write( + f""" + DACE_EXPORTED void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}); + void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}) + """, cfg, state_id, scope_entry) + + # Open bracket + self._localcode.write('{', cfg, state_id, scope_entry) + + # ----------------- Guard Checks handling ----------------------- + + # Ensure that iteration space is neither empty nor negative sized + single_dimchecks = [] + for gdim in grid_dims: + # Only emit a guard if we can't statically prove gdim > 0 + if (gdim > 0) != True: + single_dimchecks.append(f'(({sym2cpp(gdim)}) <= 0)') + + dimcheck = ' || '.join(single_dimchecks) + + if dimcheck: + emptygrid_warning = '' + if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): + emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' + 'due to an empty grid.\\n");') + + self._localcode.write( + f''' + if ({dimcheck}) {{ + {emptygrid_warning} + return; + }}''', cfg, state_id, scope_entry) + + # ----------------- Kernel Launch Invocation ----------------------- + stream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + kargs = ', '.join(['(void *)&' + arg for arg in kernel_args_as_input]) + self._localcode.write( + f''' + void *{kernel_name}_args[] = {{ {kargs} }}; + gpuError_t __err = {self.backend}LaunchKernel( + (void*){kernel_name}, dim3({gdims}), dim3({bdims}), {kernel_name}_args, {0}, {stream_var_name} + ); + ''', cfg, state_id, scope_entry) + + self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});\n') + self._localcode.write(generate_sync_debug_call()) + + # Close bracket + self._localcode.write('}', cfg, state_id, scope_entry) + + ########################################################################### + # Generation of Memory Copy Logic + + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], + edge: Tuple[nodes.Node, str, nodes.Node, str, + Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import (CopyContext, + OutOfKernelCopyStrategy, + SyncCollaboritveGPUCopyStrategy) + + context = CopyContext(sdfg, cfg.state(state_id), src_node, dst_node, edge, + self._gpu_stream_manager.gpustream_assignments) + + if OutOfKernelCopyStrategy().applicable(context): + # Handled during the GPU stream pipeline in preprocess() + # in form of explicit tasklets + return + 
+        elif SyncCollaboritveGPUCopyStrategy().applicable(context):
+            code = SyncCollaboritveGPUCopyStrategy().generate_copy(context, self._kernel_dimensions_map)
+            callsite_stream.write(code, cfg, state_id, [src_node, dst_node])
+        else:
+            # Fallback
+            self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream)
+
+    #############################################################################
+    # Predicates for Dispatcher
+
+    def state_dispatch_predicate(self, sdfg, state):
+        """
+        Determines whether a given state should be processed by this
+        code generator (`ExperimentalCUDACodeGen`).
+
+        Returns True if either:
+        1. The state has associated GPU memory that needs to be released
+           (i.e., it appears in `self.pool_release`), or
+        2. The code generator is currently generating device/kernel code.
+        """
+        return any(s is state for s, _ in self.pool_release.values()) or self._in_device_code
+
+    def node_dispatch_predicate(self, sdfg, state, node):
+        """
+        Determines whether a node should be handled by this
+        code generator (`ExperimentalCUDACodeGen`).
+
+        Returns True if:
+        - The node has a GPU schedule handled by this backend, or
+        - The generator is currently generating kernel code.
+        """
+        schedule = getattr(node, 'schedule', None)
+
+        if schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN:
+            return True
+
+        if self._in_device_code:
+            return True
+
+        return False
+
+    #############################################################################
+    # Nested SDFGs & tasklets
+
+    def generate_state(self,
+                       sdfg: SDFG,
+                       cfg: ControlFlowRegion,
+                       state: SDFGState,
+                       function_stream: CodeIOStream,
+                       callsite_stream: CodeIOStream,
+                       generate_state_footer: bool = False) -> None:
+
+        # Use the frame code generator to generate the state
+        self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream)
+
+        # Special case: when not inside device code, release pooled memory that is no longer needed in this state
+        if not self._in_device_code:
+
+            handled_keys = set()
+            backend = self.backend
+            for (pool_sdfg, name), (pool_state, _) in self.pool_release.items():
+
+                if (pool_sdfg is not sdfg) or (pool_state is not state):
+                    continue
+
+                data_descriptor = pool_sdfg.arrays[name]
+                ptrname = ptr(name, data_descriptor, pool_sdfg, self._frame)
+
+                # Adjust if there is an offset
+                if isinstance(data_descriptor, dt.Array) and data_descriptor.start_offset != 0:
+                    ptrname = f'({ptrname} - {sym2cpp(data_descriptor.start_offset)})'
+
+                # Free the memory
+                callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', pool_sdfg)
+                callsite_stream.write(generate_sync_debug_call())
+
+                # We handled the key (pool_sdfg, name) and can remove it later
+                handled_keys.add((pool_sdfg, name))
+
+            # Delete the handled keys here (not in the for loop, which would cause issues)
+            for key in handled_keys:
+                del self.pool_release[key]
+
+        # Invoke all instrumentation providers
+        for instr in self._frame._dispatcher.instrumentation.values():
+            if instr is not None:
+                instr.on_state_end(sdfg, cfg, state, callsite_stream, function_stream)
+
+    def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node,
+                      function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+
+        # Get the node-specific generating function, if one exists
+        gen = getattr(self, '_generate_' + type(node).__name__, False)
+
+        # If no specialized generator is implemented, fall back to the CPU codegen
+        if gen is not False:
+            gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+        elif type(node).__name__ == 'MapExit' and node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN:
+            # Special case: a MapExit of a GPU schedule is already closed by a KernelScopeManager
+            # instance; passing it to the CPU codegen would close the scope a second time.
+            return
+        else:
+            self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+
+    def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label):
+        return 'DACE_DFI ' + self._cpu_codegen.generate_nsdfg_header(
+            sdfg, cfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False)
+
+    def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label):
+        return self._cpu_codegen.generate_nsdfg_call(sdfg,
+                                                     cfg,
+                                                     state,
+                                                     node,
+                                                     memlet_references,
+                                                     sdfg_label,
+                                                     state_struct=False)
+
+    def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node):
+        args = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node)
+        return args
+
+    def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                             node: nodes.NestedSDFG, function_stream: CodeIOStream,
+                             callsite_stream: CodeIOStream) -> None:
+        old_schedule = self._toplevel_schedule
+        self._toplevel_schedule = node.schedule
+        old_codegen = self._cpu_codegen.calling_codegen
+        self._cpu_codegen.calling_codegen = self
+
+        # Determine and update ctype of new constant data and symbols within the NSDFG
+        parent_state: SDFGState = cfg.state(state_id)
+        nsdfg = node.sdfg
+
+        # New scope for defined variables
+        dispatcher: TargetDispatcher = self._dispatcher
+        dispatcher.defined_vars.enter_scope(node)
+
+        # Add the const qualifier to any constants not marked as such
+        """
+        # update const data
+        new_const_data = sdutil.get_constant_data(node, nsdfg)
+        for name in new_const_data:
+            desc = nsdfg.arrays[name]
+            ptr_name = ptr(name, desc, nsdfg, self._frame)
+            try:
+                defined_type, ctype = dispatcher.defined_vars.get(ptr_name, is_global=True)
+            except:
+                defined_type = get_defined_type(desc)
+                if defined_type == DefinedType.Pointer:
+                    ctype = f'{desc.ctype} *'
+                elif defined_type == DefinedType.Scalar:
+                    ctype = desc.ctype
+                else:
+                    raise NotImplementedError("Not expected Type")
+
+            if not "const " in ctype:
+                ctype = f"const {ctype}"
+            dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True)
+
+        # update const symbols
+        new_const_symbols = sdutil.get_constant_symbols(node, nsdfg)
+        for name in new_const_symbols:
+            defined_type = DefinedType.Scalar
+            if not "const" in nsdfg.symbols[name].ctype:
+                ctype = f"const {nsdfg.symbols[name].ctype}"
+        """
+
+        # Redirect the rest to the CPU codegen
+        self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+
+        # Exit scope
+        dispatcher.defined_vars.exit_scope(node)
+
+        self._cpu_codegen.calling_codegen = old_codegen
+        self._toplevel_schedule = old_schedule
+
+    def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                          node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        # Import ScopeManager, which opens and closes brackets for conditions; useful here
+        # because the location dictionary may prescribe which threads/blocks run the tasklet code
+        from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import ScopeManager
+
+        tasklet: nodes.Tasklet = node
+        with ScopeManager(self, sdfg, cfg, dfg, state_id, function_stream, callsite_stream,
+                          
brackets_on_enter=False) as scope_manager: + + if 'gpu_thread' in tasklet.location: + name = 'gpu_thread' + index_expr = self._get_thread_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + if 'gpu_warp' in tasklet.location: + name = 'gpu_warp' + index_expr = self._get_warp_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + if 'gpu_block' in tasklet.location: + name = 'gpu_block' + index_expr = self._get_block_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + # Call CPU codegen + self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + def _generate_condition_from_location(self, name: str, index_expr: str, location: Union[int, str, + subsets.Range]) -> str: + + # 1. Normalize location + if isinstance(location, str) and ':' in location: + location = subsets.Range.from_string(location) + if len(location) != 1: + raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') + elif symbolic.issymbolic(location): + location = sym2cpp(location) + + # 2. Build condition + if isinstance(location, subsets.Range): + # Range of indices + begin, end, stride = location[0] + rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) + cond = f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})' + if stride != 1: + cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)' + else: + # Single-element + cond = f'({index_expr}) == {location}' + + return cond + + def _get_thread_id(self) -> str: + kernel_block_dims: List = self._current_kernel_spec.block_dims + result = 'threadIdx.x' + if kernel_block_dims[1] != 1: + result += f' + ({sym2cpp(kernel_block_dims[0])}) * threadIdx.y' + if kernel_block_dims[2] != 1: + result += f' + ({sym2cpp(kernel_block_dims[0] * kernel_block_dims[1])}) * threadIdx.z' + return result + + def _get_warp_id(self) -> str: + return f'(({self._get_thread_id()}) / warpSize)' + + def _get_block_id(self) -> str: + kernel_block_dims: List = self._current_kernel_spec.block_dims + result = 'blockIdx.x' + if kernel_block_dims[1] != 1: + result += f' + gridDim.x * blockIdx.y' + if kernel_block_dims[2] != 1: + result += f' + gridDim.x * gridDim.y * blockIdx.z' + return result + + ####################################################################### + # Array Declaration, Allocation and Deallocation + + def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: + + ptrname = ptr(node.data, nodedesc, sdfg, self._frame) + fsymbols = self._frame.symbols_and_constants(sdfg) + + # ----------------- Guard checks -------------------- + + # NOTE: `dfg` is None iff `nodedesc` is non-free symbol dependent (see DaCeCodeGenerator.determine_allocation_lifetime). + # We avoid `is_nonfree_sym_dependent` when dfg is None and `nodedesc` is a View. 
+        if dfg and not sdutil.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols):
+            raise NotImplementedError(
+                "declare_array is only for variables that require separate declaration and allocation.")
+
+        if nodedesc.storage == dtypes.StorageType.GPU_Shared:
+            raise NotImplementedError("Dynamic shared memory unsupported")
+
+        if nodedesc.storage == dtypes.StorageType.Register:
+            raise ValueError("Dynamic allocation of registers is not allowed")
+
+        if nodedesc.storage not in {dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned}:
+            raise NotImplementedError(f"CUDA: Unimplemented storage type {nodedesc.storage.name}.")
+
+        if self._dispatcher.declared_arrays.has(ptrname):
+            return  # Already declared
+
+        # ----------------- Declaration --------------------
+        dataname = node.data
+        array_ctype = f'{nodedesc.dtype.ctype} *'
+        declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node)
+        self._dispatcher.declared_arrays.add(dataname, DefinedType.Pointer, array_ctype)
+
+    def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                       node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                       declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        """
+        Declares (if necessary) and allocates an array. In contrast to ``declare_array``, which only
+        declares data that requires separate declaration and allocation, this method also emits the
+        allocation itself, dispatching to a storage-type-specific handler.
+        """
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+
+        # ------------- Guard checks & Redirect to CPU CodeGen -------------
+
+        # Skip if variable is already defined
+        if self._dispatcher.defined_vars.has(dataname):
+            return
+
+        if isinstance(nodedesc, dace.data.Stream):
+            raise NotImplementedError("allocate_stream not implemented in ExperimentalCUDACodeGen")
+
+        elif isinstance(nodedesc, dace.data.View):
+            return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream,
+                                                   allocation_stream)
+        elif isinstance(nodedesc, dace.data.Reference):
+            return self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream,
+                                                        declaration_stream, allocation_stream)
+
+        # For persistent/external data, refresh the descriptor so it matches the top-level
+        # allocation site (see ``update_persistent_desc``)
+        if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
+            nodedesc = update_persistent_desc(nodedesc, sdfg)
+
+        # NOTE: Experimental GPU stream arrays need no allocation here (they alias the GPU context's streams)
+        if nodedesc.dtype == dtypes.gpuStream_t:
+            return
+
+        # ------------------- Allocation/Declaration -------------------
+
+        # Call the appropriate handler based on storage type
+        gen = getattr(self, f'_prepare_{nodedesc.storage.name}_array', None)
+        if gen:
+            gen(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream)
+        else:
+            raise NotImplementedError(f'CUDA: Unimplemented storage type {nodedesc.storage}')
+
+    def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                                  node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                                  declaration_stream: CodeIOStream, allocation_stream: CodeIOStream):
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+
+        # ------------------- Declaration -------------------
+        declared = self._dispatcher.declared_arrays.has(dataname)
+
+        if not declared:
+            array_ctype = f'{nodedesc.dtype.ctype} *'
+            declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node)
+            self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype)
+
+        # ------------------- Allocation -------------------
+        arrsize = 
nodedesc.total_size + arrsize_malloc = f'{sym2cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + + if nodedesc.pool: + gpu_stream_manager = self._gpu_stream_manager + gpu_stream = gpu_stream_manager.get_stream_node(node) + allocation_stream.write( + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {gpu_stream}));\n', + cfg, state_id, node) + + # Generate synchronization and error-check calls if sync debugging is enabled + allocation_stream.write(generate_sync_debug_call()) + + else: + # Strides are left to the user's discretion + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}Malloc((void**)&{dataname}, {arrsize_malloc}));\n', + cfg, state_id, node) + + # ------------------- Initialization ------------------- + if node.setzero: + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}Memset({dataname}, 0, {arrsize_malloc}));\n', cfg, + state_id, node) + + if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: + allocation_stream.write(f'{dataname} += {sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) + + def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + + # ------------------- Declaration ------------------- + declared = self._dispatcher.declared_arrays.has(dataname) + + if not declared: + array_ctype = f'{nodedesc.dtype.ctype} *' + declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + + # ------------------- Allocation ------------------- + arrsize = nodedesc.total_size + arrsize_malloc = f'{sym2cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + + # Strides are left to the user's discretion + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}MallocHost(&{dataname}, {arrsize_malloc}));\n', cfg, + state_id, node) + if node.setzero: + allocation_stream.write(f'memset({dataname}, 0, {arrsize_malloc});\n', cfg, state_id, node) + + if nodedesc.start_offset != 0: + allocation_stream.write(f'{dataname} += {sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) + + def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + arrsize = nodedesc.total_size + + # ------------------- Guard checks ------------------- + if symbolic.issymbolic(arrsize, sdfg.constants): + raise NotImplementedError('Dynamic shared memory unsupported') + if nodedesc.start_offset != 0: + raise NotImplementedError('Start offset unsupported for shared memory') + + # ------------------- Declaration ------------------- + array_ctype = f'{nodedesc.dtype.ctype} *' + + declaration_stream.write(f'__shared__ {nodedesc.dtype.ctype} {dataname}[{sym2cpp(arrsize)}];\n', cfg, state_id, + node) + + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + + # ------------------- Initialization ------------------- + if node.setzero: + allocation_stream.write( + f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(sym2cpp(self._current_kernel_spec.block_dims))}, {sym2cpp(arrsize)}, ' + f'1, false>::Reset({dataname});\n', cfg, 
state_id, node)
+
+    def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                                node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                                declaration_stream: CodeIOStream, allocation_stream: CodeIOStream):
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+        arrsize = nodedesc.total_size
+
+        # ------------------- Guard checks -------------------
+        if symbolic.issymbolic(arrsize, sdfg.constants):
+            raise ValueError('Dynamic allocation of registers not allowed')
+        if nodedesc.start_offset != 0:
+            raise NotImplementedError('Start offset unsupported for registers')
+
+        # ------------------- Declaration & Initialization -------------------
+        array_ctype = f'{nodedesc.dtype.ctype} *'
+        init_clause = ' = {0}' if node.setzero else ''
+
+        declaration_stream.write(f'{nodedesc.dtype.ctype} {dataname}[{sym2cpp(arrsize)}]{init_clause};\n', cfg,
+                                 state_id, node)
+
+        self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype)
+
+    def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                         node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                         callsite_stream: CodeIOStream) -> None:
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+
+        # Adjust offset if needed
+        if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0:
+            dataname = f'({dataname} - {sym2cpp(nodedesc.start_offset)})'
+
+        # Remove declaration info
+        if self._dispatcher.declared_arrays.has(dataname):
+            is_global = nodedesc.lifetime in (
+                dtypes.AllocationLifetime.Global,
+                dtypes.AllocationLifetime.Persistent,
+                dtypes.AllocationLifetime.External,
+            )
+            self._dispatcher.declared_arrays.remove(dataname, is_global=is_global)
+
+        # Special case: Stream
+        if isinstance(nodedesc, dace.data.Stream):
+            raise NotImplementedError('stream code is not implemented in ExperimentalCUDACodeGen (yet)')
+
+        # Special case: View - no deallocation
+        if isinstance(nodedesc, dace.data.View):
+            return
+
+        # Main deallocation logic by storage type
+        if nodedesc.storage == dtypes.StorageType.GPU_Global:
+            if nodedesc.pool:
+                if (sdfg, dataname) not in self.pool_release:
+                    gpu_stream = self._gpu_stream_manager.get_stream_node(node)
+                    callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeAsync({dataname}, {gpu_stream}));\n', cfg,
+                                          state_id, node)
+            else:
+                callsite_stream.write(f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', cfg, state_id, node)
+
+        elif nodedesc.storage == dtypes.StorageType.CPU_Pinned:
+            if nodedesc.dtype == dtypes.gpuStream_t:
+                return
+            callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeHost({dataname}));\n', cfg, state_id, node)
+
+        elif nodedesc.storage in {dtypes.StorageType.GPU_Shared, dtypes.StorageType.Register}:
+            # No deallocation needed
+            return
+
+        else:
+            raise NotImplementedError(f'Deallocation not implemented for storage type: {nodedesc.storage.name}')
+
+    def get_generated_codeobjects(self):
+        fileheader = CodeIOStream()
+
+        self._frame.generate_fileheader(self._global_sdfg, fileheader, 'cuda')
+
+        # The GPU stream array is given a persistent allocation lifetime (see the GPU stream pipeline in preprocess()).
+        # Thus, the definition of the GPU stream array in the state struct and the accesses to it are handled elsewhere,
+        # in several different files (e.g., framecode.py, cpu.py, cpp.py). For consistency, we initialize it here the
+        # way the other modules expect it, i.e., prefixed with the CFG ID of the SDFG in which it is defined.
+        # Note that all the different variable names point to the same GPU stream array.
+        cnt = 0
+        init_gpu_stream_vars = ""
+        gpu_stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(",")[0]
+        for csdfg, name, desc in self._global_sdfg.arrays_recursive(include_nested_data=True):
+            if name == gpu_stream_array_name and desc.lifetime == dtypes.AllocationLifetime.Persistent:
+                init_gpu_stream_vars = f"__state->__{csdfg.cfg_id}_{name}"
+                break
+
+        # Collect the init code blocks of all SDFGs and convert them to C++
+        initcode = CodeIOStream()
+        for sd in self._global_sdfg.all_sdfgs_recursive():
+            if None in sd.init_code:
+                initcode.write(codeblock_to_cpp(sd.init_code[None]), sd)
+            if 'cuda' in sd.init_code:
+                initcode.write(codeblock_to_cpp(sd.init_code['cuda']), sd)
+        initcode.write(self._initcode.getvalue())
+
+        exitcode = CodeIOStream()
+        for sd in self._global_sdfg.all_sdfgs_recursive():
+            if None in sd.exit_code:
+                exitcode.write(codeblock_to_cpp(sd.exit_code[None]), sd)
+            if 'cuda' in sd.exit_code:
+                exitcode.write(codeblock_to_cpp(sd.exit_code['cuda']), sd)
+        exitcode.write(self._exitcode.getvalue())
+
+        if self.backend == 'cuda':
+            backend_header = 'cuda_runtime.h'
+        elif self.backend == 'hip':
+            backend_header = 'hip/hip_runtime.h'
+        else:
+            raise NameError('GPU backend "%s" not recognized' % self.backend)
+
+        params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg))
+        if params_comma:
+            params_comma = ', ' + params_comma
+
+        pool_header = ''
+        if self.has_pool:
+            poolcfg = Config.get('compiler', 'cuda', 'mempool_release_threshold')
+            pool_header = f'''
+    cudaMemPool_t mempool;
+    cudaDeviceGetDefaultMemPool(&mempool, 0);
+    uint64_t threshold = {poolcfg if poolcfg != -1 else 'UINT64_MAX'};
+    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
+'''
+
+        self._codeobject.code = """
+#include <{backend_header}>
+#include <dace/dace.h>
+
+{file_header}
+
+DACE_EXPORTED int __dace_init_experimental_cuda({sdfg_state_name} *__state{params});
+DACE_EXPORTED int __dace_exit_experimental_cuda({sdfg_state_name} *__state);
+
+{other_globalcode}
+
+int __dace_init_experimental_cuda({sdfg_state_name} *__state{params}) {{
+    int count;
+
+    // Check that we are able to run {backend} code
+    if ({backend}GetDeviceCount(&count) != {backend}Success)
+    {{
+        printf("ERROR: GPU drivers are not configured or {backend}-capable device "
+               "not found\\n");
+        return 1;
+    }}
+    if (count == 0)
+    {{
+        printf("ERROR: No {backend}-capable devices found\\n");
+        return 2;
+    }}
+
+    // Initialize {backend} before we run the application
+    float *dev_X;
+    DACE_GPU_CHECK({backend}Malloc((void **) &dev_X, 1));
+    DACE_GPU_CHECK({backend}Free(dev_X));
+
+    {pool_header}
+
+    __state->gpu_context = new dace::cuda::Context({nstreams}, {nevents});
+
+    // Create {backend} streams and events
+    for(int i = 0; i < {nstreams}; ++i) {{
+        DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->internal_streams[i], {backend}StreamNonBlocking));
+        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams
+    }}
+    for(int i = 0; i < {nevents}; ++i) {{
+        DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming));
+    }}
+
+    {initcode}
+
+    return 0;
+}}
+
+int __dace_exit_experimental_cuda({sdfg_state_name} *__state) {{
+    {exitcode}
+
+    // Synchronize and check for CUDA errors
+    int __err = static_cast<int>(__state->gpu_context->lasterror);
+    if (__err == 0)
+    
__err = static_cast({backend}DeviceSynchronize()); + + // Destroy {backend} streams and events + for(int i = 0; i < {nstreams}; ++i) {{ + DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->internal_streams[i])); + }} + for(int i = 0; i < {nevents}; ++i) {{ + DACE_GPU_CHECK({backend}EventDestroy(__state->gpu_context->events[i])); + }} + + delete __state->gpu_context; + return __err; +}} + + +{localcode} +""".format(params=params_comma, + sdfg_state_name=mangle_dace_state_struct_name(self._global_sdfg), + initcode=initcode.getvalue(), + exitcode=exitcode.getvalue(), + other_globalcode=self._globalcode.getvalue(), + localcode=self._localcode.getvalue(), + file_header=fileheader.getvalue(), + nstreams=self._gpu_stream_manager.num_gpu_streams, + nevents=self._gpu_stream_manager.num_gpu_events, + backend=self.backend, + backend_header=backend_header, + pool_header=pool_header, + sdfg=self._global_sdfg) + + return [self._codeobject] + + ####################################################################### + # Compilation Related + + @staticmethod + def cmake_options(): + options = [] + + # Override CUDA toolkit + if Config.get('compiler', 'cuda', 'path'): + options.append("-DCUDA_TOOLKIT_ROOT_DIR=\"{}\"".format( + Config.get('compiler', 'cuda', 'path').replace('\\', '/'))) + + # Get CUDA architectures from configuration + backend = common.get_gpu_backend() + if backend == 'cuda': + cuda_arch = Config.get('compiler', 'cuda', 'cuda_arch').split(',') + cuda_arch = [ca for ca in cuda_arch if ca is not None and len(ca) > 0] + + cuda_arch = ';'.join(cuda_arch) + options.append(f'-DDACE_CUDA_ARCHITECTURES_DEFAULT="{cuda_arch}"') + + flags = Config.get("compiler", "cuda", "args") + options.append("-DCMAKE_CUDA_FLAGS=\"{}\"".format(flags)) + + if backend == 'hip': + hip_arch = Config.get('compiler', 'cuda', 'hip_arch').split(',') + hip_arch = [ha for ha in hip_arch if ha is not None and len(ha) > 0] + + flags = Config.get("compiler", "cuda", "hip_args") + flags += " -G -g" + flags += ' ' + ' '.join( + '--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) + for arch in hip_arch) + options.append("-DEXTRA_HIP_FLAGS=\"{}\"".format(flags)) + + if Config.get('compiler', 'cpu', 'executable'): + host_compiler = make_absolute(Config.get("compiler", "cpu", "executable")) + options.append("-DCUDA_HOST_COMPILER=\"{}\"".format(host_compiler)) + + return options + + ####################################################################### + # Callback to CPU codegen + + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + self._cpu_codegen.define_out_memlet(sdfg, cfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, + callsite_stream) + + def process_out_memlets(self, *args, **kwargs): + # Call CPU implementation with this code generator as callback + self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) + + +######################################################################### +# helper class +# This one is closely linked to the ExperimentalCUDACodeGen. In fact, +# it only exists to not have to much attributes and methods in the ExperimentalCUDACodeGen +# and to group Kernel specific methods & information. 
Thus, KernelSpec should remain in this file +class KernelSpec: + """ + A helper class to encapsulate information required for working with kernels. + This class provides a structured way to store and retrieve kernel parameters. + """ + + def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: ControlFlowRegion, + dfg_scope: ScopeSubgraphView, state_id: int): + + # Get kernel entry/exit nodes and current state + kernel_map_entry: nodes.MapEntry = dfg_scope.source_nodes()[0] + kernel_parent_state: SDFGState = cfg.state(state_id) + + self._kernel_map_entry: nodes.MapEntry = kernel_map_entry + self._kernels_state: SDFGState = kernel_parent_state + + # Kernel name + self._kernel_name: str = f'{kernel_map_entry.map.label}_{cfg.cfg_id}_{kernel_parent_state.block_id}_{kernel_parent_state.node_id(kernel_map_entry)}' + + # Get and store kernel constants — needed for applying 'const' and updating defined + # constant variable types in the dispatcher (handled at GPU codegen) + kernel_const_data = sdutil.get_constant_data(kernel_map_entry, kernel_parent_state) + kernel_const_symbols = sdutil.get_constant_symbols(kernel_map_entry, kernel_parent_state) + kernel_constants = kernel_const_data | kernel_const_symbols + self._kernel_constants: Set[str] = kernel_constants + + arglist: Dict[str, dt.Data] = cudaCodeGen._kernel_arglists[kernel_map_entry] + self._arglist = arglist + + # save _in_device_code value for restoring later + restore_in_device_code = cudaCodeGen._in_device_code + + # Certain args are called in the CUDA/HIP file or kernel funcion, in which the pointer name of the args are different + cudaCodeGen._in_device_code = True + self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] + + # Special: Persistent arguments + args_typed = [] + for name, data in arglist.items(): + if data.lifetime == dtypes.AllocationLifetime.Persistent: + arg_name = ptr(name, data, sdfg, cudaCodeGen._frame) + else: + arg_name = name + args_typed.append(('const ' if name in kernel_constants else '') + data.as_arg(name=arg_name)) + + self._args_typed = args_typed + + # Args for the kernel wrapper function + cudaCodeGen._in_device_code = False + + # Gather GPU stream information: + # - Use the connector name when passing the stream to the kernel + # - Use the configured variable name (from Config) in the wrapper’s function signature + # (this same name is also used when invoking {backend}LaunchKernel inside the wrapper) + gpustream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_input = [ + e for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry) + if e.src.desc(sdfg).dtype == dtypes.gpuStream_t + ] + if len(gpustream_input) > 1: + raise ValueError( + f"There can not be more than one GPU stream assigned to a kernel, but {len(gpustream_input)} were assigned." 
+ ) + + # Final wrapper arguments: + # - State struct (__state) + # - Original kernel args + # - GPU stream + self._kernel_wrapper_args_as_input = ( + ['__state'] + [ptr(name, data, sdfg, cudaCodeGen._frame) + for name, data in arglist.items()] + [str(gpustream_input[0].dst_conn)]) + + self._kernel_wrapper_args_typed = ([f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + + args_typed + [f"gpuStream_t {gpustream_var_name}"]) + + cudaCodeGen._in_device_code = restore_in_device_code + + # The kernel's grid and block dimensions + self._grid_dims, self._block_dims = cudaCodeGen._kernel_dimensions_map[kernel_map_entry] + + # C type of block, thread, and warp indices (as a string) + self._gpu_index_ctype: str = self.get_gpu_index_ctype() + + # Warp size (backend-dependent) + if cudaCodeGen.backend not in ['cuda', 'hip']: + raise ValueError(f"Unsupported backend '{cudaCodeGen.backend}' in ExperimentalCUDACodeGen. " + "Only 'cuda' and 'hip' are supported.") + + warp_size_key = 'cuda_warp_size' if cudaCodeGen.backend == 'cuda' else 'hip_warp_size' + self._warpSize = Config.get('compiler', 'cuda', warp_size_key) + + def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: + """ + Retrieves the GPU index data type as a C type string (for thread, block, warp indices) + from the configuration and if it matches a DaCe data type. + + Raises: + ValueError: If the configured type does not match a DaCe data type. + + Returns: + str: + The C type string corresponding to the configured GPU index type. + Used for defining thread, block, and warp indices in the generated code. + """ + type_name = Config.get('compiler', 'cuda', config_key) + dtype = getattr(dtypes, type_name, None) + if not isinstance(dtype, dtypes.typeclass): + raise ValueError( + f'Invalid {config_key} "{type_name}" configured (used for thread, block, and warp indices): ' + 'no matching DaCe data type found.\n' + 'Please use a valid type from dace.dtypes (e.g., "int32", "uint64").') + return dtype.ctype + + @property + def kernel_constants(self) -> Set[str]: + """Returns the kernel's constant data and symbols.""" + return self._kernel_constants + + @property + def kernel_name(self) -> list[str]: + """Returns the kernel (function's) name.""" + return self._kernel_name + + @property + def kernel_map_entry(self) -> nodes.MapEntry: + """ + Returns the entry node of the kernel, which is a MapEntry node + scheduled with dace.dtypes.ScheduleType.GPU_Device. + """ + return self._kernel_map_entry + + @property + def kernel_map(self) -> nodes.Map: + """Returns the kernel's map node.""" + return self._kernel_map_entry.map + + @property + def arglist(self) -> Dict[str, dt.Data]: + """ + Returns a dictionary of arguments for the kernel's subgraph, + mapping each data name to its corresponding data descriptor. + """ + return self._arglist + + @property + def args_as_input(self) -> list[str]: + """ + Returns the kernel function arguments formatted for use as inputs + when calling/launching the kernel function. + """ + return self._args_as_input + + @property + def args_typed(self) -> list[str]: + """ + Returns the typed kernel function arguments suitable for declaring + the kernel function. Each argument includes its corresponding data type. + """ + return self._args_typed + + @property + def kernel_wrapper_args_as_input(self) -> list[str]: + """ + Returns the argument names passed to the kernel wrapper function. 
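+
+        Illustrative example (all names are hypothetical; the stream connector name comes
+        from the GPU stream pipeline): for a kernel reading an array ``A`` and a scalar
+        ``alpha``, this property could yield roughly ``['__state', 'A', 'alpha', 'gpu_stream']``,
+        while ``kernel_wrapper_args_typed`` would yield the matching typed list ending in
+        ``gpuStream_t <configured stream variable name>``.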
+ + The kernel wrapper is a function defined in the CUDA/HIP code that is called + from the CPU code and is responsible for launching the kernel function. + """ + return self._kernel_wrapper_args_as_input + + @property + def kernel_wrapper_args_typed(self) -> list[str]: + """ + Returns the typed arguments used to declare the kernel wrapper function. + + The kernel wrapper is defined in the CUDA/HIP code, called from the CPU side, + and is responsible for launching the actual kernel function. + """ + return self._kernel_wrapper_args_typed + + @property + def grid_dims(self) -> list: + """Returns the grid dimensions of the kernel.""" + return self._grid_dims + + @property + def block_dims(self) -> list: + """Returns the block dimensions of the kernel.""" + return self._block_dims + + @property + def warpSize(self) -> int: + """ + Returns the warp size used in this kernel. + This value depends on the selected backend (CUDA or HIP) + and is retrieved from the configuration. + """ + return self._warpSize + + @property + def gpu_index_ctype(self) -> str: + """ + Returns the C data type used for GPU indices (thread, block, warp) + in generated code. This type is determined by the 'gpu_index_type' + setting in the configuration and matches with a DaCe typeclass. + """ + return self._gpu_index_ctype diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py new file mode 100644 index 0000000000..3982f3a86d --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -0,0 +1,756 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Union + +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace import symbolic +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp, unparse_cr +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call +from dace.config import Config +from dace.dtypes import StorageType +from dace.frontend import operations +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. 
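+
+    A minimal usage sketch (illustrative; in practice the code generator constructs the
+    context and selects one of the strategies defined below):
+
+        ctx = CopyContext(sdfg, state, src_node, dst_node, edge, gpustream_assignments)
+        strategy = OutOfKernelCopyStrategy()
+        if strategy.applicable(ctx):
+            code = strategy.generate_copy(ctx)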
+ """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.gpustream_assignments = gpustream_assignments + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. + + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. + """ + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) + + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") + + if src_stream != dst_stream: + raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same.") + + # 2. Generate GPU stream expression + gpustream = src_stream + gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. 
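+
+        Illustrative example: for a copy from a GPU_Global array to a CPU_Heap array this
+        returns ('Device', 'Host'), which the copy code then combines into the transfer
+        kind string, e.g. ``cudaMemcpyDeviceToHost`` for the 'cuda' backend.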
+ """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. + """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be 
AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + # Build final expressions + src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) + dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) + + return copy_shape, src_strides, dst_strides, src_expr, dst_expr + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. 
Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False + + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." + copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. 
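+
+        Illustrative output (assuming the 'cuda' backend, a contiguous host-to-device copy
+        of 128 floats and a stream variable named ``gpu_stream0``; all names are hypothetical):
+
+            DACE_GPU_CHECK(cudaMemcpyAsync(dst_ptr, src_ptr, 128 * sizeof(float),
+                                           cudaMemcpyHostToDevice, gpu_stream0));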
+ """ + # ----------- Retrieve relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call -------------------- + + if is_contiguous_copy: + # Memory is linear: can use {backend}MemcpyAsync + copysize = ' * '.join(sym2cpp(copy_shape)) + copysize += f' * sizeof({ctype})' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n' + + else: + # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch + # This allows copying a strided 1D region + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Potentially snychronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() + return call + + def _generate_2d_copy(self, copy_context: CopyContext) -> None: + """ + Generates a 2D memory copy using {backend}Memcpy2DAsync. + + Three main cases are handled: + - Copy between row-major stored arrays with contiguous rows. + - Copy between column-major stored arrays with contiguous columns. + - A special case where a 2D copy can still be represented. + + Raises: + NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. + Such cases indicate an unsupported 2D copy and should be examined separately. + They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. + + Note: + {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), + but not both simultaneously. + """ + + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call if supported -------------------- + + # Case: Row-major layout, rows are not strided. + if (src_strides[1] == 1) and (dst_strides[1] == 1): + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[0])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Case: Column-major layout, no columns are strided. 
+ elif (src_strides[0] == 1) and (dst_strides[0] == 1): + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[1])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Special case + elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]): + # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with + # copy shape [I, J], src_strides[J*K, K], dst_strides[J, 1]. This can be represented with a + # {backend}Memcpy2DAsync call! + + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0] * copy_shape[1]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + else: + raise NotImplementedError( + f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." + "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." + ) + + return call + + def _generate_nd_copy(self, copy_context: CopyContext) -> None: + """ + Generates GPU code for copying N-dimensional arrays using 2D memory copies. + + Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops + for any outer dimensions. Expects the copy to be contiguous and between + row-major storage locations. 
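+
+        Illustrative output for a device-to-device copy of shape [4, 32, 32] between
+        contiguous row-major float arrays, i.e. strides [1024, 32, 1] on both sides
+        (backend 'cuda'; pointer and stream names are hypothetical):
+
+            for (int __copyidx0 = 0; __copyidx0 < 4; ++__copyidx0) {
+            DACE_GPU_CHECK(cudaMemcpy2DAsync(dst_ptr + (__copyidx0 * (1024)), 32 * sizeof(float),
+                                             src_ptr + (__copyidx0 * (1024)), 32 * sizeof(float),
+                                             32 * sizeof(float), 32,
+                                             cudaMemcpyDeviceToDevice, gpu_stream0));
+            }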
+        """
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+        num_dims = len(copy_shape)
+
+        # ----------- Guard for unsupported patterns --------------
+        if not (src_strides[-1] == 1 and dst_strides[-1] == 1):
+            src_node, dst_node = copy_context.src_node, copy_context.dst_node
+            src_storage = copy_context.get_storage_type(src_node)
+            dst_storage = copy_context.get_storage_type(dst_node)
+            raise NotImplementedError(
+                "N-dimensional GPU memory copies that are strided or involve column-major arrays are currently not supported.\n"
+                f"  Source node: {src_node} (storage: {src_storage})\n"
+                f"  Destination node: {copy_context.dst_node} (storage: {dst_storage})\n"
+                f"  Source strides: {src_strides}\n"
+                f"  Destination strides: {dst_strides}\n"
+                f"  Copy shape: {copy_shape}\n")
+
+        # ----------------- Generate and write backend call(s) --------------------
+
+        call = ""
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n"
+
+        # Write Memcpy2DAsync
+        offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})'
+        spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})'
+        width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})'
+        height = sym2cpp(copy_shape[-2])
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate call and write it
+        call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Write for-loop footers
+        for dim in range(num_dims - 2):
+            call += "\n}"
+
+        # Return the code
+        return call
+
+
+class SyncCollaboritveGPUCopyStrategy(CopyStrategy):
+    """
+    Implements (synchronous) collaborative GPU copy operations.
+
+    This strategy generates the appropriate code for copies performed
+    inside GPU kernels, where multiple threads cooperate to move data
+    between GPU memory spaces (e.g., global to shared memory).
+    """
+
+    def applicable(self, copy_context: CopyContext) -> bool:
+        """
+        Checks if the copy is eligible for a collaborative GPU-to-GPU copy.
+
+        Conditions:
+        1. The copy is between two AccessNodes.
+        2. The copy is between GPU memory StorageTypes (shared or global).
+        3. The innermost non-sequential map is a GPU_Device-scheduled map, i.e.,
+           the copy occurs within a kernel but is not within a GPU_ThreadBlock map.
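+
+        Illustrative example: a copy from a GPU_Global AccessNode into a GPU_Shared
+        AccessNode placed directly inside a GPU_Device map qualifies; the same copy
+        nested inside a GPU_ThreadBlock map does not, because the innermost
+        non-sequential map is then the thread-block map.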
+ """ + # --- Condition 1: src and dst are AccessNodes --- + src_node, dst_node = copy_context.src_node, copy_context.dst_node + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # --- Condition 2: GPU to GPU memory transfer --- + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) + gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} + + if not (src_storage in gpu_storages and dst_storage in gpu_storages): + return False + + # --- Condition 3: Next non-sequential Map is a GPU_Device Map --- + next_nonseq_parent_map = self._next_non_seq_parent_map(copy_context) + if next_nonseq_parent_map is None: + return False + else: + return next_nonseq_parent_map.map.schedule == dtypes.ScheduleType.GPU_Device + + def generate_copy(self, copy_context: CopyContext, kernel_dimensions_maps: Dict[nodes.MapEntry, + Tuple[List, List]]) -> str: + """ + Generates a GPU copy call as a string using DaCe's runtime CUDA copy functions. + + The function determines the appropriate templated copy function from + `dace/libraries/runtime/include/dace/cuda/copy.cuh` and constructs + the call string with the necessary arguments, including kernel block + dimensions and optional accumulation/reduction information. + + Parameters + ---------- + copy_context : CopyContext + Helper object containing information about the copy. + + kernel_dimensions_maps : Dict[nodes.MapEntry, Tuple[List, List]] + Kernel map (GPU_Devie scheduled map) entry nodes to (grid_dims, block_dims); + block_dims needed in templating. + + Returns + ------- + str + The GPU copy call in C++ as a string. + + Notes + ----- + - The kernel block size could be derived, but since this function is typically called + from `ExperimentalCUDACodeGen`, it is provided as input to avoid recomputation. + - The template functions use a parameter called 'is_async', which is set to True here + because `ExperimentalCUDACodeGen` inserts "__syncthreads()" explicitly in tasklets. 
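+        - Illustrative shape of the constructed call (names and sizes are hypothetical):
+          for a 2D global-to-shared copy of a ``double`` tile with block dims (64, 8, 1),
+          copy size (32, 32), source strides (1024, 1) and destination strides (32, 1),
+          the emitted string looks roughly like
+          ``dace::GlobalToShared2D<double, 64, 8, 1, 32, 32, 32, 1, true>(gA + off, 1024, 1, sA);``.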
+ """ + # ----------- Retrieve relevant copy information -------------- + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + ctype = dtype.ctype + + # Get copy function name (defined in runtime library) + num_dims = len(copy_shape) + src_node, dst_node = copy_context.src_node, copy_context.dst_node + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) + src_storage_name = self._get_storagename(src_storage) + dst_storage_name = self._get_storagename(dst_storage) + function_name = f"dace::{src_storage_name}To{dst_storage_name}{num_dims}D" + + # Extract WCR info (accumulation template + optional custom reduction) + accum, custom_reduction = self._get_accumulation_info(copy_context) + custom_reduction = [custom_reduction] if custom_reduction else [] + + # Get parent kernel block dimensions (guaranteed GPU_Device) and sync flag + parent_kernel = self._next_non_seq_parent_map(copy_context) + block_dims = ", ".join(sym2cpp(kernel_dimensions_maps[parent_kernel][1])) + synchronized = "true" # Legacy 'is_async'; sync barriers handled by passes (see docstring) + + # ------------------------- Generate copy call ---------------------------- + + if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction + dst_strides + copy_shape) + args = ", ".join(sym2cpp(args_list)) + call = f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});" + + elif function_name == "dace::SharedToGlobal1D": + copy_size = ', '.join(sym2cpp(copy_shape)) + accum = accum or '::Copy' + args_list = ([src_expr] + src_strides + [dst_expr] + dst_strides + custom_reduction) + args = ", ".join(sym2cpp(args_list)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});" + + else: + copy_size = ', '.join(sym2cpp(copy_shape)) + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction) + args = ", ".join(sym2cpp(args_list)) + dst_strides_unpacked = ", ".join(sym2cpp(dst_strides)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});" + + return call + + def _get_accumulation_info(self, copy_context: CopyContext) -> Tuple[str, str]: + """ + Extracts write-conflict resolution (WCR) information from the copy context + and returns the accumulation/reduction template components needed for the + final templated function call in `generate_copy()`. + + This method processes WCR information from the memlet and generates the + appropriate C++ template strings for both predefined and custom reductions. + + Parameters + ---------- + copy_context : CopyContext + Copy context containing the copy operation details, including + the memlet with WCR information. + + Returns + ------- + Tuple[str, str] + A tuple containing: + - accum : str + Template accumulation string for the function call. Empty string if no WCR, + `"::template Accum"` for predefined reductions, or `"::template Accum"` for custom reductions. + - custom_reduction : str + C++ formatted custom reduction code string. Empty string for no WCR or predefined reductions, + unparsed custom reduction code for custom reductions. 
+ """ + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + memlet = copy_context.edge.data + wcr = memlet.wcr + reduction_type = operations.detect_reduction_type(wcr) + + if wcr is None: + accum, custom_reduction = "", "" + + elif reduction_type != dtypes.ReductionType.Custom: + # Use predefined reduction + reduction_type_str = str(reduction_type).split(".")[-1] # e.g., "Sum" + accum = f"::template Accum" + custom_reduction = "" + + else: + accum = "::template Accum" + custom_reduction = unparse_cr(sdfg, wcr, dtype) + + return accum, custom_reduction + + def _get_storagename(self, storage: dtypes.StorageType): + """ + Returns a string containing the name of the storage location. + + Example: dtypes.StorageType.GPU_Shared will return "Shared". + """ + storage_name = str(storage) + return storage_name[storage_name.rindex('_') + 1:] + + def _next_non_seq_parent_map(self, copy_context: CopyContext) -> Optional[nodes.MapEntry]: + """ + Traverse up the parent map chain from the deeper of src_node or dst_node + in `copy_context` and return the first parent MapEntry whose schedule + is not sequential. + + Parameters + ---------- + copy_context : CopyContext + Context information about the memory copy. + + Returns + ------- + Optional[nodes.MapEntry] + The first non-sequential parent MapEntry encountered, or None if no + such parent exists. + """ + src_node, dst_node = copy_context.src_node, copy_context.dst_node + state = copy_context.state + scope_dict = state.scope_dict() + + # Determine which node (src or dst) is in the deeper scope + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + current_node = deeper_node + while (current_node is None or not isinstance(current_node, nodes.MapEntry) + or current_node.map.schedule == dtypes.ScheduleType.Sequential): + parent = helpers.get_parent_map(state, current_node) + if parent is None: + current_node = None + break + current_node, state = parent + + return current_node diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py new file mode 100644 index 0000000000..329547331a --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py @@ -0,0 +1,80 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, Union +from dace import SDFG, nodes + + +class GPUStreamManager: + """ + Manage GPU backend streams (e.g., CUDA or HIP) for nodes in an SDFG. + + Nodes are assigned stream IDs by the NaiveGPUStreamScheduler Pass, and + this class provides their access expressions and tracks the number of streams + in use. GPU events are not (yet) supported. + + Note + ---- + "Stream" refers to backend GPU streams, not DaCe data streams. + """ + + def __init__(self, sdfg: SDFG, gpustream_assignments: Dict[nodes.Node, int]): + self.sdfg = sdfg + self._stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" + self._gpustream_assignments = gpustream_assignments + self._num_gpu_streams = max(gpustream_assignments.values()) + 1 if gpustream_assignments else 0 + self._num_gpu_events = 0 + + def get_stream_node(self, node: nodes.Node) -> str: + """ + Return the access expression for the GPU stream assigned to a node. + + Parameters + ---------- + node : nodes.Node + The node for which to return the access expression of its assigned CUDA stream. 
+ + Returns + ------- + str + The GPU stream access expression, e.g., + "__state->gpu_context->streams[0]". + + Raises + ------ + ValueError + If the given node does not have an assigned stream. + """ + if node in self.gpustream_assignments: + return self._stream_access_template.format(gpu_stream=self.gpustream_assignments[node]) + else: + raise ValueError(f"No GPU stream assigned to node {node}. " + "Check whether the node is relevant for GPU stream assignment and, if it is, " + "inspect the GPU stream pipeline to see why no stream was assigned.") + + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: + """ + Returns the GPU stream access expression for an edge. + + Currently unused: edge-level streams were only needed for asynchronous + memory-copy operations (e.g., cudaMemcpyAsync). These copies are now + modeled via tasklets in the SDFG, so edges do not carry stream info. + Implement this if the design changes and edges need streams again. + """ + raise NotImplementedError("Edge-level GPU streams are not supported. " + "They were previously used for asynchronous memory copies (e.g., cudaMemcpyAsync), " + "but these are now modeled via tasklets in the SDFG. " + "Implement this if the design changes and edges must carry GPU stream information.") + + @property + def num_gpu_events(self) -> int: + """Number of GPU events (currently always 0, left here for potential future support).""" + return 0 + + @property + def num_gpu_streams(self) -> int: + """Number of GPU streams in use (stream IDs start at 0).""" + return self._num_gpu_streams + + @property + def gpustream_assignments(self) -> Dict[nodes.Node, int]: + """Mapping of nodes to assigned GPU stream IDs (not all nodes necessarily have a GPU stream ID).""" + return self._gpustream_assignments diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py new file mode 100644 index 0000000000..27c073afc8 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -0,0 +1,163 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import functools + +import sympy +from typing import Set, List + +import dace +from dace import Config, data as dt, dtypes +from dace.sdfg import nodes, SDFGState +from dace.codegen import common +from dace.codegen.dispatcher import DefinedType +from dace.transformation.helpers import get_parent_map + + +def get_cuda_dim(idx): + """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ + if idx < 0 or idx > 2: + raise ValueError(f'idx must be between 0 and 2, got {idx}') + return ('x', 'y', 'z')[idx] + + +def product(iterable): + """ + Computes the symbolic product of elements in the iterable using sympy.Mul. + + This is equivalent to: ```functools.reduce(sympy.Mul, iterable, 1)```. + + Purpose: This function is used to improve readability of the codeGen. + """ + return functools.reduce(sympy.Mul, iterable, 1) + + +def to_3d_dims(dim_sizes: List) -> List: + """ + Converts a list of dimension sizes to a 3D format. + + If the list has more than three dimensions, all dimensions beyond the second are + collapsed into the third (via multiplication). If the list has fewer than three + entries, it is padded with 1s to ensure a fixed length of three. 
+ + Examples: + [x] → [x, 1, 1] + [x, y] → [x, y, 1] + [x, y, z] → [x, y, z] + [x, y, z, u, v] → [x, y, z * u * v] + """ + + if len(dim_sizes) > 3: + # multiply everything from the 3rd onward into d[2] + dim_sizes[2] = product(dim_sizes[2:]) + dim_sizes = dim_sizes[:3] + + # pad with 1s if necessary + dim_sizes += [1] * (3 - len(dim_sizes)) + + return dim_sizes + + +def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: List): + """ + Validates that the given block size for a kernel does not exceed typical CUDA hardware limits. + + These limits are not enforced by the CUDA compiler itself, but are configurable checks + performed by DaCe during GPU code generation. They are based on common hardware + restrictions and can be adjusted via the configuration system. + + Specifically, this function checks: + - That the total number of threads in the block does not exceed `compiler.cuda.block_size_limit`. + - That the number of threads in the last (z) dimension does not exceed + `compiler.cuda.block_size_lastdim_limit`. + + Raises: + ValueError: If either limit is exceeded. + """ + kernel_map_label = kernel_map_entry.map.label + + total_block_size = product(block_size) + limit = Config.get('compiler', 'cuda', 'block_size_limit') + lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') + + if (total_block_size > limit) == True: + raise ValueError(f'Block size for kernel "{kernel_map_label}" ({block_size}) ' + f'is larger than the possible number of threads per block ({limit}). ' + 'The kernel will potentially not run, please reduce the thread-block size. ' + 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' + 'configuration entry.') + + if (block_size[-1] > lastdim_limit) == True: + raise ValueError(f'Last block size dimension for kernel "{kernel_map_label}" ({block_size}) ' + 'is larger than the possible number of threads in the last block dimension ' + f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' + 'thread-block size. To increase this limit, modify the ' + '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + + +def generate_sync_debug_call() -> str: + """ + Generate backend sync and error-check calls as a string if + synchronous debugging is enabled. + + Parameters + ---------- + backend : str + Backend API prefix (e.g., 'cuda'). + + Returns + ------- + str + The generated debug call code, or an empty string if debugging is disabled. + """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call + + +def get_defined_type(data: dt.Data) -> DefinedType: + """ + Return the DefinedType for a data descriptor. + Currently supports only scalars and arrays; extend if others are needed. + """ + if isinstance(data, dt.Scalar): + return DefinedType.Scalar + elif isinstance(data, dt.Array): + return DefinedType.Pointer + else: + raise NotImplementedError(f"Data type '{type(data).__name__}' is not supported for defined type inference." + "Only Scalars and Arrays are expected for Kernels.") + + +def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: + """ + Checks if the given node is enclosed within a Map whose schedule type + matches any in the `schedules` set. 
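+
+    Example (illustrative): with ``schedules={dtypes.ScheduleType.GPU_Device}``, a tasklet
+    nested inside a GPU_ThreadBlock map that is itself inside a GPU_Device map returns True,
+    while a node outside any GPU map returns False.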
+ + Parameters + ---------- + state : SDFGState + The State where the node resides + node : nodes.Node + The node to check. + schedules : set[dtypes.ScheduleType] + A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). + + Returns + ---------- + bool + True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise. + """ + current = node + + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule in schedules: + return True + + parent = get_parent_map(state, current) + if parent is None: + return False + current, state = parent diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py new file mode 100644 index 0000000000..800b6ab4c8 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -0,0 +1,550 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod + +import dace +from dace import dtypes, subsets, symbolic +from dace.sdfg import SDFG, ScopeSubgraphView, nodes, SDFGState +from dace.sdfg.state import ControlFlowRegion +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.dispatcher import DefinedType, TargetDispatcher +from dace.transformation import helpers +from dace.codegen.targets.cpp import sym2cpp +from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, KernelSpec +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import (get_cuda_dim, product) + +#---------------------------------------------------------------------------------- +# GPU Scope Generation Strategies +#---------------------------------------------------------------------------------- + + +class ScopeGenerationStrategy(ABC): + """Base strategy for generating GPU scope code""" + + def __init__(self, codegen: ExperimentalCUDACodeGen): + self.codegen: ExperimentalCUDACodeGen = codegen + self._dispatcher: TargetDispatcher = codegen._dispatcher + self._current_kernel_spec: KernelSpec = codegen._current_kernel_spec + + @abstractmethod + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + raise NotImplementedError('Abstract class') + + +class KernelScopeGenerator(ScopeGenerationStrategy): + + def __init__(self, codegen: ExperimentalCUDACodeGen): + super().__init__(codegen) + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + + node = dfg_scope.source_nodes()[0] + schedule_type = node.map.schedule + + # This strategy starts kernel code generation and is only valid if + # the outermost (first) GPU schedule is of type GPU_Device. 
+ applicable = schedule_type == dtypes.ScheduleType.GPU_Device + return applicable + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + # Generate kernel function signature + self._generate_kernel_signature(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Generate kernel body + with ScopeManager(frame_codegen=self.codegen._frame, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id, + function_stream=function_stream, + callsite_stream=callsite_stream, + comment="Kernel scope") as scope_manager: + + # ----------------- Retrieve kernel configuration ----------------------- + + kernel_spec = self._current_kernel_spec + kernel_entry_node = kernel_spec._kernel_map_entry # == dfg_scope.source_nodes()[0] + kernel_map = kernel_spec.kernel_map + + # ----------------- Kernel/Map Range Preprocessing ----------------------- + + reversed_kernel_range = kernel_map.range[::-1] # also reverse it + kernel_range = subsets.Range(reversed_kernel_range) + kernel_dimensions = len(kernel_range) + kernel_dim_sizes = kernel_range.size() + + # ----------------- Set up symbolic index expressions ----------------------- + + symbolic_indices = [ + symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions) + ] + symbolic_coordinates = kernel_range.coord_at(symbolic_indices) + + # ----------------- Generate Thread or Block index Definitions ----------------------- + + thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices + + # In case there is no ThreadBlock map used in a submap, the map variables will + # be mapped to thread IDs instead of block IDs + for dim in range(kernel_dimensions): + + var_name = kernel_map.params[-dim - 1] # also reverse it here! 
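+
+                # Worked example (illustrative, not generated code): for a 4D kernel whose
+                # reversed dimension sizes are [S0, S1, S2, S3], the expressions below give
+                #   dim 0 -> blockIdx.x
+                #   dim 1 -> blockIdx.y
+                #   dim 2 -> (blockIdx.z / (S3))
+                #   dim 3 -> ((blockIdx.z / (1)) % (S3))
+                # i.e. all dimensions from the third onward share blockIdx.z and are recovered
+                # by division/modulo with the tail product of the remaining sizes; the result
+                # is then shifted/scaled through the map range via the symbolic coordinates.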
+
+                # Compute index expressions for up to 3 dimensions (x, y, z)
+                if dim < 3:
+                    index_expr = f'blockIdx.{get_cuda_dim(dim)}'
+                    # Delinearize third dimension if more than 3D (used in 3D+ mapping)
+                    if dim == 2 and kernel_dimensions > 3:
+                        tail_prod = product(kernel_dim_sizes[3:])
+                        index_expr = f"({index_expr} / ({sym2cpp(tail_prod)}))"
+
+                else:  # Handle dimensions beyond the third (delinearize and modulo)
+                    index_expr = 'blockIdx.z'
+                    tail_prod = product(kernel_dim_sizes[dim + 1:])
+                    index_expr = (f"(({index_expr} / ({sym2cpp(tail_prod)})) % ({sym2cpp(kernel_dim_sizes[dim])}))")
+
+                # Define the thread/block index variable
+                var_def = sym2cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr)
+                callsite_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node)
+                self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype)
+
+            self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream, callsite_stream)
+
+            # ----------------- Dispatch Subgraph code generation -----------------------
+
+            self._dispatcher.dispatch_subgraph(sdfg,
+                                               cfg,
+                                               dfg_scope,
+                                               state_id,
+                                               function_stream,
+                                               callsite_stream,
+                                               skip_entry_node=True)
+
+            self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream,
+                                                           callsite_stream)
+
+    def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView,
+                                   state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream):
+
+        kernel_name = self._current_kernel_spec.kernel_name
+        kernel_args = self._current_kernel_spec.args_typed
+        block_dims = self._current_kernel_spec.block_dims
+        node = dfg_scope.source_nodes()[0]
+
+        # Conditionally add __launch_bounds__ for block size optimization.
+        launch_bounds = ''
+        if node.gpu_launch_bounds != '-1':
+            if node.gpu_launch_bounds == "0":
+                if not any(symbolic.issymbolic(b) for b in block_dims):
+                    launch_bounds = f'__launch_bounds__({product(block_dims)})'
+            else:
+                launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})'
+
+        # Emit kernel function signature
+        callsite_stream.write(f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', cfg,
+                              state_id, node)
+
+
+class ThreadBlockScopeGenerator(ScopeGenerationStrategy):
+
+    def __init__(self, codegen: ExperimentalCUDACodeGen):
+        super().__init__(codegen)
+
+    def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int,
+                   function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool:
+
+        node = dfg_scope.source_nodes()[0]
+        applicable = node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock
+
+        return applicable
+
+    def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int,
+                 function_stream: CodeIOStream, callsite_stream: CodeIOStream):
+
+        with ScopeManager(frame_codegen=self.codegen._frame,
+                          sdfg=sdfg,
+                          cfg=cfg,
+                          dfg_scope=dfg_scope,
+                          state_id=state_id,
+                          function_stream=function_stream,
+                          callsite_stream=callsite_stream,
+                          comment="ThreadBlock Scope") as scope_manager:
+
+            node = dfg_scope.source_nodes()[0]
+            scope_map = node.map
+
+            # ----------------- Map Range Preprocessing -----------------------
+
+            # Reverse range for better performance (e.g.
memory coalescing) + reversed_scope_range = scope_map.range[::-1] + map_range = subsets.Range(reversed_scope_range) + map_dimensions = len(map_range) + map_dim_sizes = map_range.size() + + kernel_block_dims = self._current_kernel_spec.block_dims + + # ----------------- Symbolic Index Expressions ----------------------- + + symbolic_indices = [ + symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(map_dimensions) + ] + symbolic_index_bounds = [ + idx + (block_dim * rng[2]) - 1 + for idx, block_dim, rng in zip(symbolic_indices, kernel_block_dims, map_range) + ] + symbolic_coordinates = map_range.coord_at(symbolic_indices) + + # ----------------- Generate Index Variable Definitions ----------------------- + + # Get the block's index dace data type + block_id_ctype = self._current_kernel_spec.gpu_index_ctype + + for dim in range(map_dimensions): + var_name = scope_map.params[-dim - 1] # also reverse it here! + + if dim < 3: + # First three dimensions: direct mapping or partial delinearization + if dim == 2 and map_dimensions > 3: + tail_prod = product(map_dim_sizes[3:]) + base_expr = f"(threadIdx.z / ({sym2cpp(tail_prod)}))" + else: + base_expr = f"threadIdx.{get_cuda_dim(dim)}" + else: + # Dimensions beyond the third: full delinearization + tail_prod = product(map_dim_sizes[dim + 1:]) + base_expr = (f"((threadIdx.z / ({sym2cpp(tail_prod)})) % ({sym2cpp(map_dim_sizes[dim])}))") + + var_def = sym2cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) + callsite_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) + + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + # ----------------- Guard Conditions for Block Execution ----------------------- + + # Generate conditions for this block's execution using min and max + # element, e.g. 
skipping out-of-bounds threads in trailing block + minels = map_range.min_element() + maxels = map_range.max_element() + for dim, (var_name, start, end) in enumerate(zip(scope_map.params[::-1], minels, maxels)): + + # Optimize conditions if they are always true + ############################################# + + condition = '' + + # Block range start + if dim >= 3 or (symbolic_indices[dim] >= start) != True: + condition += f'{var_name} >= {sym2cpp(start)}' + + # Special case: block size is exactly the range of the map (0:b) + if dim >= 3: + skipcond = False + else: + skipcond = symbolic_index_bounds[dim].subs({symbolic_indices[dim]: start}) == end + + # Block range end + if dim >= 3 or (not skipcond and (symbolic_index_bounds[dim] < end) != True): + if len(condition) > 0: + condition += ' && ' + condition += f'{var_name} < {sym2cpp(end + 1)}' + + # Emit condition in code if any + if len(condition) > 0: + scope_manager.open(condition=condition) + + # ----------------- Dispatch Subgraph code generation ----------------------- + + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + +class WarpScopeGenerator(ScopeGenerationStrategy): + + def __init__(self, codegen: ExperimentalCUDACodeGen): + super().__init__(codegen) + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + + node = dfg_scope.source_nodes()[0] + applicable = node.map.schedule == dtypes.ScheduleType.GPU_Warp + + return applicable + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + with ScopeManager(frame_codegen=self.codegen._frame, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id, + function_stream=function_stream, + callsite_stream=callsite_stream, + comment="WarpLevel Scope") as scope_manager: + + # Get kernel specifications + kernel_spec = self._current_kernel_spec + block_dims = kernel_spec.block_dims + warpSize = kernel_spec.warpSize + + state_dfg = cfg.state(state_id) + node = dfg_scope.source_nodes()[0] + scope_map = node.map + + map_range = subsets.Range(scope_map.range[::-1]) # Reversed for potential better performance + warp_dim = len(map_range) + + # The following sizes and bounds are be symbolic + num_threads_in_block = product(block_dims) + warp_dim_bounds = [max_elem + 1 for max_elem in map_range.max_element()] + num_warps = product(warp_dim_bounds) + + # The C type used to define the (flat) threadId and warpId variables + ids_ctype = kernel_spec.gpu_index_ctype + + # ----------------- Guard checks ----------------------- + + # handles checks either at compile time or runtime (i.e. 
checks in the generated code) + self._handle_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, + callsite_stream, scope_manager) + + # ----------------- Define (flat) Thread ID within Block ----------------------- + + flattened_terms = [] + + for i, dim_size in enumerate(block_dims): + + if dim_size == 1: + continue + + dim = get_cuda_dim(i) + stride = [f"{block_dims[j]}" for j in range(i) if block_dims[j] > 1] + idx_expr = " * ".join(stride + [f"threadIdx.{get_cuda_dim(i)}"]) if stride else f"threadIdx.{dim}" + flattened_terms.append(idx_expr) + + joined_terms = " + ".join(flattened_terms) + flat_thread_idx_expr = f"({joined_terms})" if len(flattened_terms) > 1 else joined_terms + + threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, + state_dfg.node_id(node)) + + callsite_stream.write(f"{ids_ctype} {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, + state_id, node) + self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, ids_ctype) + + # ----------------- Compute Map indices (= Warp indices) ----------------------- + + for i in range(warp_dim): + var_name = scope_map.params[-i - 1] # reverse order + previous_sizes = warp_dim_bounds[:i] + + if len(previous_sizes) > 0: + divisor = product(previous_sizes) + expr = f"(({threadID_name} / {divisor}) % ({warp_dim_bounds[i]}))" + else: + expr = f"({threadID_name} % ({warp_dim_bounds[i]}))" + + callsite_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) + + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + # ----------------- Guard Conditions for Warp Execution ----------------------- + + if num_warps * warpSize != num_threads_in_block: + condition = f'{threadID_name} < {num_warps}' + scope_manager.open(condition) + + warp_range = [(start, end + 1, stride) for start, end, stride in map_range.ranges] + + for dim, (var_name, (start, _, stride)) in enumerate(zip(scope_map.params[::-1], warp_range)): + + condition_terms = [] + + if start != 0: + condition_terms.append(f"{var_name} >= {start}") + + if stride != 1: + expr = var_name if start == 0 else f"({var_name} - {start})" + condition_terms.append(f'{expr} % {stride} == 0') + + if condition_terms: + condition = " && ".join(condition_terms) + scope_manager.open(condition) + + # ----------------- Dispatch Subgraph code generation ----------------------- + + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, + warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, + scope_manager: 'ScopeManager'): + + #TODO: Move them to sdfg validation as well if possible + + # Get warpSize from the kernel specification + warpSize = self._current_kernel_spec.warpSize + + parent_map, _ = helpers.get_parent_map(state_dfg, node) + if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: + raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") + + if warp_dim > 3: + raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") + + # Guard against invalid thread/block configurations. 
+        # - For concrete (compile-time) values, raise Python errors early.
+        # - For symbolic values, insert runtime CUDA checks (guards) into the generated kernel.
+        #   These will emit meaningful error messages and abort execution if violated.
+        if isinstance(num_threads_in_block, symbolic.symbol):
+            condition = (f"{num_threads_in_block} % {warpSize} != 0 || "
+                         f"{num_threads_in_block} > 1024 || "
+                         f"{num_warps} * {warpSize} > {num_threads_in_block}")
+            kernel_stream.write(f"""\
+                if ({condition}) {{
+                    printf("CUDA error:\\n"
+                           "1. Block must be a multiple of {warpSize} threads (DaCe requirement for GPU_Warp scheduling).\\n"
+                           "2. Block size must not exceed 1024 threads (CUDA hardware limit).\\n"
+                           "3. Number of warps x {warpSize} must fit in the block (otherwise logic is unclear).\\n");
+                    asm("trap;");
+                }}
+                """)
+
+        else:
+            if isinstance(num_warps, symbolic.symbol):
+                condition = f"{num_warps} * {warpSize} > {num_threads_in_block}"
+                scope_manager.open(condition=condition)
+
+            elif num_warps * warpSize > num_threads_in_block:
+                raise ValueError(f"Invalid configuration: {num_warps} warps x {warpSize} threads exceed "
+                                 f"{num_threads_in_block} threads in the block.")
+
+            if num_threads_in_block % warpSize != 0:
+                raise ValueError(f"Block must be a multiple of {warpSize} threads for GPU_Warp scheduling "
+                                 f"(got {num_threads_in_block}).")
+
+            if num_threads_in_block > 1024:
+                raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).")
+
+        for min_element in map_range.min_element():
+            if isinstance(min_element, symbolic.symbol):
+                kernel_stream.write(
+                    f'if ({min_element} < 0) {{\n'
+                    f'    printf("Runtime error: Warp ID symbol {min_element} must be non-negative.\\n");\n'
+                    f'    asm("trap;");\n'
+                    f'}}\n')
+            elif min_element < 0:
+                raise ValueError(f"Warp ID value {min_element} must be non-negative.")
+
+
+#----------------------------------------------------------------------------------
+# Scope Manager, handling brackets and allocation/deallocation of arrays in Scopes
+#----------------------------------------------------------------------------------
+
+
+class ScopeManager:
+    """
+    A helper class that manages opening and closing brackets in a structured way via the 'with' statement,
+    ensuring that every opened scope is closed correctly. It also supports an optional debug mode that adds
+    comments to the generated code, which can help with debugging and understanding the code structure.
+    """
+
+    def __init__(self,
+                 frame_codegen: DaCeCodeGenerator,
+                 sdfg: SDFG,
+                 cfg: ControlFlowRegion,
+                 dfg_scope: ScopeSubgraphView,
+                 state_id: int,
+                 function_stream: CodeIOStream,
+                 callsite_stream: CodeIOStream,
+                 comment: str = None,
+                 brackets_on_enter: bool = True,
+                 debug: bool = False):
+        """
+        Initializes the ScopeManager.
+
+        :param frame_codegen: The frame code generator, used for allocating and deallocating arrays in scopes.
+        :param sdfg: The SDFG instance for context.
+        :param cfg: The ControlFlowRegion instance for context.
+        :param dfg_scope: The ScopeSubgraphView instance for context.
+        :param state_id: The ID of the current state for context.
+        :param function_stream: The CodeIOStream for function-level code.
+        :param callsite_stream: The CodeIOStream for callsite-level code.
+        :param comment: A descriptive comment explaining the purpose of the code block being opened. Default is None.
+        :param brackets_on_enter: Whether a bracket should be opened upon entering the 'with' block. Default is True.
+ :param debug: Whether to include debug comments in the output. Defaults to False. + """ + self.frame_codegen = frame_codegen + self.sdfg = sdfg + self.cfg = cfg + self.dfg_scope = dfg_scope + self.state_id = state_id + self.function_stream = function_stream + self.callsite_stream = callsite_stream + self.comment = comment + self.brackets_on_enter = brackets_on_enter + self.debug = debug + self._opened = 0 + + self.entry_node = self.dfg_scope.source_nodes()[0] + self.exit_node = self.dfg_scope.sink_nodes()[0] + + def __enter__(self): + """ + Writes the opening bracket in case self.brackets_on_enter + is set to true, which it is by default. + """ + if self.brackets_on_enter: + self.open() + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Writes the closing brackets to the stream. + """ + for i in range(self._opened): + line = "}" + if self.debug: + line += f" // {self.comment} (close {i + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.exit_node) + + def open(self, condition: str = None): + """ + Opens a bracket. If a condition is given, emits 'if (condition) {', otherwise just '{'. + Tracks the number of open brackets for closing later. + + :param condition: Optional condition for the opening bracket. + """ + line = f"if ({condition}) {{" if condition else "{" + if self.debug: + line += f" // {self.comment} (open {self._opened + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.entry_node) + self._opened += 1 diff --git a/dace/codegen/targets/gpu_helpers/copy_strategies.py b/dace/codegen/targets/gpu_helpers/copy_strategies.py new file mode 100644 index 0000000000..27a5b2c53b --- /dev/null +++ b/dace/codegen/targets/gpu_helpers/copy_strategies.py @@ -0,0 +1,553 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Union + +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace import symbolic +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp +from dace.codegen.targets.gpu_helpers.gpu_utils import generate_sync_debug_call +from dace.config import Config +from dace.dtypes import StorageType +from dace.frontend import operations +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. 
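+
+    Example (sketch, assuming an out-of-kernel host-to-device copy between two AccessNodes and a
+    precomputed GPU stream assignment)::
+
+        ctx = CopyContext(sdfg, state, src_node, dst_node, edge, gpustream_assignments)
+        strategy = OutOfKernelCopyStrategy()
+        if strategy.applicable(ctx):
+            code = strategy.generate_copy(ctx)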
+ """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.gpustream_assignments = gpustream_assignments + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. + + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. + """ + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) + + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") + + if src_stream != dst_stream: + raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same.") + + # 2. Generate GPU stream expression + gpustream = src_stream + gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. 
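+
+        Example (illustrative): for a copy from a CPU_Heap array to a GPU_Global array,
+        this returns ('Host', 'Device').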
+ """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. + """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be 
AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + # Build final expressions + src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) + dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) + + return copy_shape, src_strides, dst_strides, src_expr, dst_expr + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. 
Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False + + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." + copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. 
+        """
+
+        # ----------- Retrieve relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1)
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+
+        # ----------------- Generate backend call --------------------
+
+        if is_contiguous_copy:
+            # Memory is linear: can use {backend}MemcpyAsync
+            copysize = ' * '.join(sym2cpp(copy_shape))
+            copysize += f' * sizeof({ctype})'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+            call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n'
+
+        else:
+            # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch,
+            # which allows copying a strided 1D region
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_2d_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates a 2D memory copy using {backend}Memcpy2DAsync.
+
+        Three main cases are handled:
+        - Copy between row-major stored arrays with contiguous rows.
+        - Copy between column-major stored arrays with contiguous columns.
+        - A special case where a 2D copy can still be represented.
+
+        Raises:
+            NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns.
+                Such cases indicate an unsupported 2D copy and should be examined separately.
+                They can be implemented if valid, or a more descriptive error should be raised if the path should not occur.
+
+        Note:
+            {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column),
+            but not both simultaneously.
+        """
+
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+
+        # ----------------- Generate backend call if supported --------------------
+
+        # Case: Row-major layout, rows are not strided.
+        if (src_strides[1] == 1) and (dst_strides[1] == 1):
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[0])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Case: Column-major layout, columns are not strided.
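+        # (e.g., both arrays stored in Fortran order, where the stride of the first dimension is 1;
+        #  illustrative, not an exhaustive characterization)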
+        elif (src_strides[0] == 1) and (dst_strides[0] == 1):
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[1])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Special case
+        elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]):
+            # As an example, consider this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with
+            # copy shape [I, J], src_strides [J*K, K] and dst_strides [J, 1]. This can still be
+            # represented with a single {backend}Memcpy2DAsync call.
+
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0] * copy_shape[1])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        else:
+            raise NotImplementedError(
+                f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}. "
+                "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken."
+            )
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_nd_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates GPU code for copying N-dimensional arrays using 2D memory copies.
+
+        Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops
+        for any outer dimensions. Expects the copy to be contiguous and between
+        row-major storage locations.
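+
+        Sketch (illustrative): for a 3D copy of shape [I, J, K], this emits one loop over the
+        outermost dimension and a single {backend}Memcpy2DAsync call per iteration, covering the
+        innermost two dimensions.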
+        """
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+        num_dims = len(copy_shape)
+
+        # ----------- Guard for unsupported patterns --------------
+        if not (src_strides[-1] == 1 and dst_strides[-1] == 1):
+            src_node, dst_node = copy_context.src_node, copy_context.dst_node
+            src_storage = copy_context.get_storage_type(src_node)
+            dst_storage = copy_context.get_storage_type(dst_node)
+            raise NotImplementedError(
+                "N-dimensional GPU memory copies that are strided or involve column-major arrays are currently not supported.\n"
+                f"  Source node: {src_node} (storage: {src_storage})\n"
+                f"  Destination node: {copy_context.dst_node} (storage: {dst_storage})\n"
+                f"  Source strides: {src_strides}\n"
+                f"  Destination strides: {dst_strides}\n"
+                f"  Copy shape: {copy_shape}\n")
+
+        # ----------------- Generate and write backend call(s) --------------------
+
+        call = ""
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n"
+
+        # Write the Memcpy2DAsync call covering the innermost two dimensions
+        offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})'
+        spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})'
+        width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})'
+        height = sym2cpp(copy_shape[-2])
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate the call and append it
+        call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call += generate_sync_debug_call()
+
+        # Write for-loop footers
+        for dim in range(num_dims - 2):
+            call += "\n}"
+
+        # Return the code
+        return call
diff --git a/dace/codegen/targets/gpu_helpers/gpu_utils.py b/dace/codegen/targets/gpu_helpers/gpu_utils.py
new file mode 100644
index 0000000000..e4c4c1fc38
--- /dev/null
+++ b/dace/codegen/targets/gpu_helpers/gpu_utils.py
@@ -0,0 +1,27 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+from dace import Config
+from dace.codegen import common
+
+
+def generate_sync_debug_call() -> str:
+    """
+    Generate backend synchronization and error-check calls as a string if
+    synchronous debugging (``syncdebug``) is enabled. The backend prefix
+    (e.g., 'cuda') is determined automatically.
+
+    Returns
+    -------
+    str
+        The generated debug call code, or an empty string if debugging is disabled.
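+
+    Example (for the 'cuda' backend, when syncdebug is enabled)::
+
+        DACE_GPU_CHECK(cudaGetLastError());
+        DACE_GPU_CHECK(cudaDeviceSynchronize());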
+ """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 72e1f784f9..71cf433eb0 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -326,7 +326,7 @@ required: Additional CUDA architectures (separated by commas) to compile GPU code for, excluding the current architecture on the compiling machine. - default: '60' + default: '86' hip_arch: type: str @@ -403,9 +403,9 @@ required: type: bool title: Synchronous Debugging description: > - Enables Synchronous Debugging mode, where each library call - is followed by full-device synchronization and error checking. - default: false + Enables debugging mode where each asynchronous GPU call is followed by + device-wide synchronization and error checking. + default: False libs: type: str @@ -454,17 +454,81 @@ required: index types are needed to address memory offsets that are beyond the 32-bit range, or to reduce memory usage. - allow_implicit_memlet_to_map: - type: bool - title: Allow the implicit conversion of Memlets to Maps during code generation. - default: true + # New configs, needed for ExperimentalCUDACodeGen + implementation: + type: str + title: CUDA codegen implementation + description: > + Choose between available CUDA code generation implementations. + "legacy" is stable, "experimental" is used by Berkay Aydogdu and + Yakup Koray Budanaz for Berkays master-thesis. + enum: [legacy, experimental] + default: experimental + + gpu_index_type: + type: str + title: Thread/block/warp index data type + default: int32 + description: > + Defines the data type for a thread, block and warp index in the generated code. + The type is based on the type-classes in ``dace.dtypes``. For example, + ``uint64`` is equivalent to ``dace.uint64``. Change this setting when large + index types are needed to address memory offsets that are beyond the 32-bit + range, or to reduce memory usage. This replaces ``thread_id_type`` in + ``ExperimentalCUDACodeGen`` , as the new name more accurately reflects its broader + usage. + + cuda_warp_size: + type: int + title: CUDA warp size + description: > + Defines the warp size used during CUDA code generation. The default and current + standard value for CUDA is 32. This should only be changed if future CUDA + architectures explicitly alter the warp size. Modifying this value arbitrarily may + result in incorrect or unknown behavior, and is therefore strongly discouraged. + default: 32 + + hip_warp_size: + type: int + title: HIP warp size description: > - If ``true`` the code generator will implicitly convert Memlets that cannot be - represented by a native library call, such as ``cudaMemcpy()`` into Maps that - explicitly copy the data around. If this value is ``false`` the code generator - will raise an exception if such a Memlet is encountered. This allows the user - to have full control over all Maps in the SDFG. + Specifies the warp size (also known as wavefront size) for HIP code generation. + The default value for AMD GPUs is typically 64. This setting should only be modified + if you have a clear understanding of what you are doing. 
+ default: 64 + auto_syncthreads_insertion: + type: bool + title: Insert Default __syncthreads() Tasklets + description: > + If enabled, inserts default __syncthreads() tasklets during preprocessing + in ExperimentalCUDACodeGen to ensure shared memory is ready before access. + This is a simple safeguard for correctness—it may not be complete, but it + does the job for basic SDFGs. Disable if you handle synchronization manually + or use other mechanisms like async copies or pipelines. + default: True + + current_thread_block_name: + type: str + title: Variable name for the current thread block + description: > + Specifies the name of the variable that holds the current thread block group, + initialized using `cooperative_groups::this_thread_block()`. This is useful in + contexts like custom tasklets, where the variable is explicitly referenced + (e.g., `cooperative_groups::wait(block)`). Setting this allows users to customize the + variable name without modifying the source code or relying on a fixed name. + default: block + + gpu_stream_name: + type: str + title: Name for the GPU stream object + description: > + GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously + and in parallel. This field specifies the naming convention for the hpu stream array and its connectors + in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the + stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a + connector for gpu_streams[0]. + default: gpu_streams,gpu_stream ############################################# # General FPGA flags diff --git a/dace/dtypes.py b/dace/dtypes.py index faadc84a50..b11c8b3bb1 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -77,6 +77,7 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping + GPU_Warp = () # A subset of GPU schedule types @@ -87,6 +88,19 @@ class ScheduleType(aenum.AutoNumberEnum): ScheduleType.GPU_Persistent, ] +# A subset of GPU schedule types for ExperimentalCUDACodeGen +GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN = [ + ScheduleType.GPU_Device, + ScheduleType.GPU_ThreadBlock, + ScheduleType.GPU_Warp, +] + +# A subset of on-GPU storage types for ExperimentalCUDACodeGen +GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN = [ + StorageType.GPU_Global, + StorageType.GPU_Shared, +] + # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, @@ -204,7 +218,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + ScheduleType.GPU_Warp: StorageType.Register, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -225,7 +240,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, + ScheduleType.GPU_Warp: ScheduleType.Sequential, } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. 
@@ -1266,6 +1282,7 @@ def isconstant(var): complex128 = typeclass(numpy.complex128) string = stringtype() MPI_Request = opaque('MPI_Request') +gpuStream_t = opaque('gpuStream_t') @undefined_safe_enum @@ -1286,6 +1303,7 @@ class Typeclasses(aenum.AutoNumberEnum): float64 = float64 complex64 = complex64 complex128 = complex128 + gpuStream_t = gpuStream_t _bool = bool diff --git a/dace/registry.py b/dace/registry.py index 08efeb65ed..bab0fa4ade 100644 --- a/dace/registry.py +++ b/dace/registry.py @@ -37,6 +37,23 @@ def autoregister(cls: Type, **kwargs): that automatically registers the subclass with the superclass registry upon creation. """ + # Ensures that the correct CUDA implementation is selected and the other is not registered. + # Registering both leads to errors. + from dace.config import Config + + name = kwargs.get('name') + impl = Config.get('compiler', 'cuda', 'implementation') + + valid_impls = {'legacy', 'experimental'} + if impl not in valid_impls: + raise ValueError(f"Invalid CUDA implementation: {impl}. " + f"Please select one of {valid_impls} under compiler.cuda.implementation in the configs.") + + # Only the CUDA implementation selected in Config is registered + if name in {'cuda', 'experimental_cuda'}: + if (impl == 'experimental' and name == 'cuda') or (impl == 'legacy' and name == 'experimental_cuda'): + return + registered = False for base in cls.__bases__: if hasattr(base, '_registry_') and hasattr(base, 'register'): diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 31ab055b48..3e7a1e450d 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -285,6 +285,16 @@ class AccessNode(Node): instrument_condition = CodeProperty(desc="Condition under which to trigger the instrumentation", default=CodeBlock("1", language=dtypes.Language.CPP)) + # Experimental-CUDA-specific properties + async_copy = Property(dtype=bool, + desc="Marks the data copy to this node (if any) as asynchronous (CUDA-specific).", + default=False) + + async_pipeline = Property(dtype=str, + desc="Name of the CUDA pipeline responsible for synchronization. " + "Only relevant if async_copy is True. 
May be None.", + allow_none=True) + def __init__(self, data, debuginfo=None): super(AccessNode, self).__init__() @@ -312,6 +322,9 @@ def __deepcopy__(self, memo): node._guid = graph.generate_element_id(node) + node._async_copy = self._async_copy + node._async_pipeline = self._async_pipeline + return node @property @@ -933,6 +946,9 @@ def used_symbols_within_scope(self, parent_state: 'dace.SDFGState', all_symbols: free_symbols |= e.data.used_symbols(all_symbols, e) + # Update with the symbols needed by the map + free_symbols |= self.free_symbols + # Do not consider SDFG constants as symbols new_symbols.update(set(parent_sdfg.constants.keys())) return free_symbols - new_symbols diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index bda9d8707e..d37ef6dae1 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -505,6 +505,8 @@ class SDFG(ControlFlowRegion): default=False, desc="Whether the SDFG contains explicit control flow constructs") + metadata = Property(dtype=dict, desc="Metada attached to the SDFG", default=None, allow_none=True) + def __init__(self, name: str, constants: Dict[str, Tuple[dt.Data, Any]] = None, @@ -597,6 +599,9 @@ def __deepcopy__(self, memo): if fixed: warnings.warn(f'Fixed {fixed} nested SDFG parent references during deep copy.') + # copy metadata + result._metadata = copy.deepcopy(self._metadata, memo) + return result @property diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index d558053d3d..2f656111f2 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -405,6 +405,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()): return result + # For the (new) gpu stream handling we can have dynamic out connectors, e.g. 
+ # KernelExit: stream -> None: AccessNode, where AccessNode accesses a Stream array + # Memlets are used but its not about seing how data flows + if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device + and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t): + return result + # Prepend incoming edges until reaching the source node curedge = edge visited = set() diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 2cb66bc765..0d07b2f3e5 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -356,6 +356,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context "Arrays that use a multibank access pattern must have the size of the first dimension equal" f" the number of banks and have at least 2 dimensions for array {name}", sdfg, None) + # Check for interstate edges that write to scalars or arrays + _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg) + # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) @@ -379,6 +382,17 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context raise +def _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg: 'dace.sdfg.SDFG'): + from dace.sdfg import InterstateEdge + for edge, graph in sdfg.all_edges_recursive(): + if edge.data is not None and isinstance(edge.data, InterstateEdge): + # sdfg.arrays return arrays and scalars, it is invalid to write to them + if any([key in graph.sdfg.arrays for key in edge.data.assignments]): + raise InvalidSDFGInterstateEdgeError( + f'Assignment to a scalar or an array detected in an interstate edge: "{edge}"', graph.sdfg, + graph.edge_id(edge)) + + def _accessible(sdfg: 'dace.sdfg.SDFG', container: str, context: Dict[str, bool]): """ Helper function that returns False if a data container cannot be accessed in the current SDFG context. @@ -906,9 +920,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', for oe in state.out_edges(dst_node)}): pass else: - raise InvalidSDFGEdgeError( - f"Memlet creates an invalid path (sink node {dst_node}" - " should be a data node)", sdfg, state_id, eid) + if isinstance(dst_node, nd.Tasklet) and len(dst_node.in_connectors) == 0 and len( + dst_node.out_connectors) == 0: + # Tasklets with no input or output connector -> sync tasklet -> OK + pass + else: + raise InvalidSDFGEdgeError( + f"Memlet creates an invalid path (sink node {dst_node}" + " should be a data node)", sdfg, state_id, eid) # If scope(dst) is disjoint from scope(src), it's an illegal memlet else: raise InvalidSDFGEdgeError("Illegal memlet between disjoint scopes", sdfg, state_id, eid) diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 4875279bea..cbccc6da37 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1552,6 +1552,38 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio return None +def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: + """ + Checks if the given node is enclosed within a Map whose schedule type + matches any in the `schedules` set. + + Parameters + ---------- + state : SDFGState + The State where the node resides + node : nodes.Node + The node to check. + schedules : set[dtypes.ScheduleType] + A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). 
+ + Returns + ---------- + bool + True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise. + """ + current = node + + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule in schedules: + return True + + parent = get_parent_map(state, current) + if parent is None: + return False + current, state = parent + + def redirect_edge(state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_src: Optional[nodes.Node] = None, diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index de1dfcf645..fe0ed80e41 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -619,7 +619,70 @@ def _create_copy_out(arrays_used: Set[str]) -> Dict[str, str]: block.replace_meta_accesses({devicename: hostname}) # Step 9: Simplify - if not self.simplify: + if self.simplify: + sdfg.simplify() + + ######################################################################## + # In case the ExperimentalCUDACodeGen is selected, we handle, for backwards + # compatibility, the use of in-kernel, transient GPU_Global stored array here. + from dace.config import Config + if not Config.get('compiler', 'cuda', 'implementation') == 'experimental': return - sdfg.simplify() + # import needed modules + from dace.transformation import helpers + from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel + import warnings + + # Detect transient GPU_Global arrays inside GPU_Device-scheduled maps + transients_in_kernels: Set[Tuple[str, data.Array, nodes.MapEntry]] = set() + transient_outside_kernels: Set[Tuple[str, data.Array]] = set() + + for node, parent in sdfg.all_nodes_recursive(): + # ---------- Consider only transient GPU_Global arrays ------- + if not isinstance(node, nodes.AccessNode): + continue + + desc = node.desc(parent) + if not isinstance(desc, data.Array): + continue + if not desc.transient: + continue + if desc.storage != dtypes.StorageType.GPU_Global: + continue + + #------- Check whether transient/access node occurs within a kernel -------- + in_kernel = False + parent_map_info = helpers.get_parent_map(state=parent, node=node) + while parent_map_info is not None: + map_entry, map_state = parent_map_info + if (isinstance(map_entry, nodes.MapEntry) and map_entry.map.schedule == dtypes.ScheduleType.GPU_Device): + in_kernel = True + break + parent_map_info = helpers.get_parent_map(map_state, map_entry) + + if in_kernel: + transients_in_kernels.add((node.data, desc, map_entry)) + else: + transient_outside_kernels.add((node.data, desc)) + + # Skip transients that are used outside of GPU kernels, unless a separate, strictly kernel-local + # transient with the same name exists inside a kernel. In such cases, 'MoveArrayOutOfKernel' is + # still applied to the local one, and naming conflicts are handled automatically. + transient_defined_inside_kernel: Set[Tuple[str, nodes.MapEntry]] = set() + for data_name, array_desc, kernel_entry in transients_in_kernels: + if (data_name, array_desc) in transient_outside_kernels: + continue + else: + transient_defined_inside_kernel.add((data_name, kernel_entry)) + + # Apply the pass and warn the user of its use + for data_name, kernel_entry in transient_defined_inside_kernel: + warnings.warn( + f"Transient array '{data_name}' with storage type GPU_Global detected inside kernel {kernel_entry}. 
" + "GPU_Global memory cannot be allocated within GPU kernels, so this usage is semantically invalid. " + "As a best-effort fix, the array will be lifted outside the kernel as a non-transient GPU_Global array. " + "Any naming conflicts are resolved automatically. " + "Please avoid this pattern, as it is strongly discouraged and may lead to undefined behavior. " + "Note that this fix provides no guarantees, especially for unusual or complex use cases.") + MoveArrayOutOfKernel().apply_pass(sdfg, kernel_entry, data_name) diff --git a/dace/transformation/passes/analysis/infer_const_args.py b/dace/transformation/passes/analysis/infer_const_args.py new file mode 100644 index 0000000000..0f66d49732 --- /dev/null +++ b/dace/transformation/passes/analysis/infer_const_args.py @@ -0,0 +1,36 @@ +import dace +from dace.transformation import pass_pipeline as ppl, transformation +from typing import Dict, Set, Tuple +from dace import properties +import dace.sdfg.utils as sdutils + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InferConstantArguments(ppl.Pass): + """ + Evaluates which symbols and data are const within a scope. + """ + + CATEGORY: str = 'Analysis' + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return modified & ppl.Modifies.CFG & ppl.Modifies.SDFG & ppl.Modifies.Nodes + + def depends_on(self): + return {} + + def apply_pass(self, sdfg: dace.SDFG, pipeline_res: Dict) -> Dict[str, Tuple[Set[str], Set[str]]]: + const_args_dict = dict() + for node, parent_graph in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device: + const_args_dict[node.guid] = (sdutils.get_constant_data(node, parent_state=parent_graph), + sdutils.get_constant_symbols(node, parent_state=parent_graph)) + elif isinstance(node, dace.sdfg.nodes.NestedSDFG): + const_args_dict[node.guid] = (sdutils.get_constant_data(node.sdfg, parent_state=parent_graph), + sdutils.get_constant_symbols(node.sdfg, parent_state=parent_graph)) + + return const_args_dict diff --git a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py new file mode 100644 index 0000000000..0421d02049 --- /dev/null +++ b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py @@ -0,0 +1,170 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import warnings +from typing import Dict, List, Set, Tuple + +import sympy + +from dace import SDFG, SDFGState, dtypes, symbolic +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils +from dace.sdfg import nodes +from dace.transformation import helpers, pass_pipeline as ppl + + +class InferGPUGridAndBlockSize(ppl.Pass): + """ + Infers the 3D CUDA launch configuration (grid and block sizes) for all GPU_Device map entries in the SDFG. + + This pass assumes the `AddThreadBlockMap` transformation has already been applied, ensuring that each kernel + either has an explicit thread block map. However it is applicable as long as each GPU_Device scheduled map + has an inner explicit GPU_ThreadBlock scheduled map. + + Block sizes are determined based on: + - Whether an explicit GPU_ThreadBlock map was inserted by `AddThreadBlockMap`. In this case, + the `gpu_block_size` attribute holds this information. + - Existing nested thread block maps and also the `gpu_block_size`, if present. 
+ + Grid sizes are computed from the kernel map's range, normalized to a 3D shape. + + NOTE: + This pass does not handle dynamic parallelism (i.e., nested GPU_Device maps), + nor does it support GPU_ThreadBlock_Dynamic maps inside kernels. Behavior is unclear in + such cases. + """ + + def apply_pass(self, sdfg: SDFG, + kernels_with_added_tb_maps: Set[nodes.MapEntry]) -> Dict[nodes.MapEntry, Tuple[List, List]]: + """ + Analyzes the given SDFG to determine the 3D grid and block sizes for all GPU_Device map entries. + + Returns: + A dictionary mapping each GPU_Device MapEntry node to a tuple (grid_dimensions, block_dimensions). + """ + # Collect all GPU_Device map entries across the SDFG + kernel_maps: Set[Tuple[ + nodes.MapEntry, + SDFGState, + ]] = set() + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.MapEntry) and node.schedule == dtypes.ScheduleType.GPU_Device: + kernel_maps.add((node, state)) + + kernel_dimensions_map: Dict[nodes.MapEntry, Tuple[List, List]] = dict() + for map_entry, state in kernel_maps: + # Compute grid size + raw_grid = map_entry.map.range.size(True)[::-1] + grid_size = gpu_utils.to_3d_dims(raw_grid) + + # Compute Block size + if map_entry in kernels_with_added_tb_maps: + block_size = self._get_inserted_gpu_block_size(map_entry) + else: + block_size = self._infer_gpu_block_size(state, map_entry) + + block_size = gpu_utils.to_3d_dims(block_size) + gpu_utils.validate_block_size_limits(map_entry, block_size) + + kernel_dimensions_map[map_entry] = (grid_size, block_size) + + return kernel_dimensions_map + + def _get_inserted_gpu_block_size(self, kernel_map_entry: nodes.MapEntry) -> List: + """ + Returns the block size from a kernel map entry with an inserted thread-block map. + + Assumes the `gpu_block_size` attribute is set by the AddThreadBlockMap transformation. + """ + gpu_block_size = kernel_map_entry.map.gpu_block_size + + if gpu_block_size is None: + raise ValueError("Expected 'gpu_block_size' to be set. This kernel map entry should have been processed " + "by the AddThreadBlockMap transformation.") + + return gpu_block_size + + def _infer_gpu_block_size(self, state: SDFGState, kernel_map_entry: nodes.MapEntry) -> List: + """ + Infers the GPU block size for a kernel map entry based on nested GPU_ThreadBlock maps. + + If the `gpu_block_size` attribute is set, it is assumed to be user-defined (not set by + a transformation like `AddThreadBlockMap`), and all nested thread-block maps must fit within it. + Otherwise, the block size is inferred by overapproximating the range sizes of all inner + GPU_ThreadBlock maps of kernel_map_entry. + + + Example: + for i in dace.map[0:N:32] @ GPU_Device: + for j in dace.map[0:32] @ GPU_ThreadBlock: + ... + for l in dace.map[0:23] @ GPU_ThreadBlock: + for k in dace.map[0:16] @ GPU_ThreadBlock: + ... 
+ + Inferred GPU block size is [32, 1, 1] + """ + # Identify nested threadblock maps + threadblock_maps = self._get_internal_threadblock_maps(state, kernel_map_entry) + + # guard check + if not threadblock_maps: + raise ValueError(f"{self.__class__.__name__} expects at least one explicit nested GPU_ThreadBlock map, " + "as it assumes AddThreadBlockMap was applied beforehand.\n" + f"Check for issues in that transformation or ensure AddThreadBlockMap was applied.") + + # Overapproximated block size enclosing all inner ThreadBlock maps + block_size = kernel_map_entry.map.gpu_block_size + detected_block_sizes = [block_size] if block_size is not None else [] + for tb_map in threadblock_maps: + + # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32) + # and collapse to GPU-compatible 3D dimensions + tb_size = [symbolic.overapproximate(s) for s in tb_map.range.size()[::-1]] + tb_size = gpu_utils.to_3d_dims(tb_size) + + if block_size is None: + block_size = tb_size + else: + block_size = [sympy.Max(sz1, sz2) for sz1, sz2 in zip(block_size, tb_size)] + + if block_size != tb_size or len(detected_block_sizes) == 0: + detected_block_sizes.append(tb_size) + + # Check for conflicting or multiple thread-block sizes + # - If gpu_block_size is explicitly defined (by the user) and conflicts with detected map sizes, raise an error + # - Otherwise, emit a warning when multiple differing sizes are detected, and over-approximate + if len(detected_block_sizes) > 1: + kernel_map_label = kernel_map_entry.map.label + + if kernel_map_entry.map.gpu_block_size is not None: + raise ValueError('Both the `gpu_block_size` property and internal thread-block ' + 'maps were defined with conflicting sizes for kernel ' + f'"{kernel_map_label}" (sizes detected: {detected_block_sizes}). ' + 'Use `gpu_block_size` only if you do not need access to individual ' + 'thread-block threads, or explicit block-level synchronization (e.g., ' + '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' + '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' + 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + + else: + warnings.warn('Multiple thread-block maps with different sizes detected for ' + f'kernel "{kernel_map_label}": {detected_block_sizes}. ' + f'Over-approximating to block size {block_size}.\n' + 'If this was not the intent, try tiling one of the thread-block maps to match.') + + return block_size + + def _get_internal_threadblock_maps(self, state: SDFGState, + kernel_map_entry: nodes.MapEntry) -> List[nodes.MapEntry]: + """ + Returns GPU_ThreadBlock MapEntries nested within a given the GPU_Device scheduled kernel map + (kernel_map_entry). + + Returns: + A List of GPU_ThreadBlock scheduled maps. + """ + threadblock_maps = [] + + for _, scope in helpers.get_internal_scopes(state, kernel_map_entry): + if isinstance(scope, nodes.MapEntry) and scope.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + threadblock_maps.append(scope) + + return threadblock_maps diff --git a/dace/transformation/passes/fix_test.py b/dace/transformation/passes/fix_test.py new file mode 100644 index 0000000000..80caa2d563 --- /dev/null +++ b/dace/transformation/passes/fix_test.py @@ -0,0 +1,110 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
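+#
+# NOTE: The `Fix` pass below has an empty docstring. From the implementation, it appears to
+# look for AccessNodes of Register-storage arrays (excluding Views and Persistent-lifetime
+# data) that are enclosed in a GPU_Device-scheduled map and whose total element count can be
+# proven to exceed 64; such arrays are turned into transient GPU_Global arrays, and a mapping
+# from array name to the enclosing kernel MapEntry is returned. For example, a Register array
+# of shape (8, 16) has 128 elements and 128 > 64, so it would be moved to GPU_Global, while a
+# shape-(4, 4) array (16 elements) would be left untouched. Symbolically sized arrays whose
+# size cannot be decided are currently left in place.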
+from typing import Any, Dict, Set, Type, Union + +import numpy as np +import sympy as sp + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types + + +@properties.make_properties +@transformation.explicit_cf_compatible +class Fix(ppl.Pass): + """ + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Descriptors | ppl.Modifies.Nodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, dace.data.Data]: + + from dace.transformation.helpers import get_parent_map + + skip = set() + to_be_moved = set() + names: Dict = dict() + for node, parent_state in sdfg.all_nodes_recursive(): + if not isinstance(node, nodes.AccessNode): + continue + + map_parent = None + state = parent_state + current = node + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule == dace.dtypes.ScheduleType.GPU_Device: + map_parent = current + break + + parent = get_parent_map(state, current) + if parent is None: + break + current, state = parent + + if map_parent is None: + continue + + if node.data not in parent_state.sdfg.arrays: + continue + + data_desc = node.desc(parent_state) + if not data_desc.storage == dtypes.StorageType.Register: + continue + + if isinstance(data_desc, dace.data.View) or data_desc.lifetime == dtypes.AllocationLifetime.Persistent: + continue + + break_cond = False + for edge, parent in sdfg.all_edges_recursive(): + if not isinstance(parent, dace.SDFGState): + continue + src = edge.src + if edge.dst_conn == node.data and isinstance(src, nodes.AccessNode) and src.data != node.data: + break_cond = True + skip.add(src.data) + + if break_cond: + continue + + shape = data_desc.shape + size_expr = np.prod(shape) + + # Try to evaluate the inequality + cmp = sp.simplify(size_expr > 64) + + if cmp is sp.true: # definitely larger + move_out = True + elif cmp is sp.false: # definitely safe + move_out = False + else: + # TODO: explain yakup and myself + # undecidable case (symbolic expression) + move_out = False # or warn, depending on policy + + if move_out: + to_be_moved.add((node.data, data_desc, map_parent)) + + for name, desc, map_parent in to_be_moved: + if name in skip: + continue + + desc.storage = dtypes.StorageType.GPU_Global + desc.transient = True + names[name] = map_parent + + return names diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py new file mode 100644 index 0000000000..225dba00e4 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py @@ -0,0 +1,70 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
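+#
+# Illustrative sketch (assuming the stream array is named `gpu_streams` and the per-stream
+# variable prefix is `gpu_stream_`; both come from the `compiler.cuda.gpu_stream_name` config
+# entry): for a kernel assigned to GPU stream 1, this pass rewires the state as
+#
+#   gpu_streams --gpu_streams[1]--> MapEntry (GPU_Device) ... MapExit --gpu_streams[1]--> gpu_streams
+#
+# i.e., the MapEntry receives an input connector `gpu_stream_1` and the MapExit an output
+# connector `gpu_stream_1`, both of type `gpuStream_t`.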
+from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class ConnectGPUStreamsToKernels(ppl.Pass): + """ + This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps). + + Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, + indicating which GPU stream each kernel is assigned to. These assignments are e.g. + used when launching the kernels. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream array name and the prefix for individual stream variables + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Link kernels to their assigned GPU streams + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a kernel entry - continue + if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}" + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Assign the GPU stream to the kernel entry + kernel_entry = node + kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_in = state.add_access(stream_array_name) + state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, + dace.Memlet(accessed_gpu_stream)) + + # Assign the GPU stream to the kernel exit + kernel_exit = state.exit_node(kernel_entry) + kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_out = state.add_access(stream_array_name) + state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, + dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py new file mode 100644 index 0000000000..58d9ff70ff --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py @@ -0,0 +1,80 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
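+#
+# Illustrative sketch (assuming the stream array is named `gpu_streams`): a tasklet whose code
+# references `__dace_current_stream`, e.g. one produced by a GPU library-node expansion, and
+# which the scheduler assigned to stream 0, is rewired as
+#
+#   gpu_streams --gpu_streams[0]--> Tasklet --gpu_streams[0]--> gpu_streams
+#
+# with `__dace_current_stream` added as an input and output connector of type `gpuStream_t`,
+# so the placeholder resolves to an explicit connector instead of a variable defined
+# implicitly during unparsing in `cpp.py`.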
+from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class ConnectGPUStreamsToTasklets(ppl.Pass): + """ + This pass ensures that tasklets which require access to their assigned GPU stream + are provided with it explicitly. + + Such tasklets typically originate from expanded LibraryNodes targeting GPUs. + These nodes may reference the special placeholder variable `__dace_current_stream`, + which is expected to be defined during unparsing in `cpp.py`. + + To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use + the GPU stream AccessNode directly. + + Note that this pass is similar to `ConnectGPUStreamsToKernels`. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream's array name + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code + # and provide them the needed GPU stream explicitly + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a tasklet - continue + if not isinstance(node, nodes.Tasklet): + continue + + # Tasklet does not need use its assigned GPU stream - continue + if not STREAM_PLACEHOLDER in node.code.as_string: + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_conn = STREAM_PLACEHOLDER + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Provide the GPU stream explicitly to the tasklet + stream_array_in = state.add_access(stream_array_name) + stream_array_out = state.add_access(stream_array_name) + + node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t) + node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True) + + state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py new file mode 100644 index 0000000000..0ad3c2e7c0 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py @@ -0,0 +1,249 @@ 
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, List, Set, Type, Union + +import dace +from dace import SDFG, SDFGState, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg.graph import Graph, NodeT +from dace.transformation import pass_pipeline as ppl, transformation + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class NaiveGPUStreamScheduler(ppl.Pass): + """ + Assigns GPU streams to nodes and stores the assignments in a dictionary. + This can be useful for enabling asynchronous and parallel GPU computation using GPU streams. + + Strategy Overview: + ------------------ + - GPU stream assignment is based on weakly connected components (WCCs) within each state. + - Nodes in the same WCC are assigned to the same stream. + - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). + - In nested SDFGs: + * Stream assignment is inherited from the parent component, + * All internal components share the parent's stream. + - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. + + Example: + -------- + A state with the following independent chains: + K1 → K2 + K3 → K4 → K5 + K6 + + would be scheduled as: + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 + + (assuming no limit on the number of concurrent streams) + + Note: + ----- + These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. + """ + + def __init__(self): + # Maximum number of concurrent streams allowed (from config). + # Cached locally for frequent reuse. + self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: + """ + Assigns GPU streams to nodes within the given SDFG. + + Parameters + ---------- + sdfg : SDFG + The top-level SDFG to process. + pipeline_results : Dict + Unused. + + Returns + ------- + Dict[nodes.Node, int] + A dictionary mapping each node to its assigned GPU stream. + """ + stream_assignments: Dict[nodes.Node, int] = dict() + for state in sdfg.states(): + self._assign_gpu_streams_in_state(sdfg, False, state, stream_assignments, 0) + + return stream_assignments + + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, + stream_assignments: Dict[nodes.Node, int], gpu_stream: int) -> None: + """ + Assigns GPU streams to nodes in a single state. + + If inside a nested SDFG, components inherit the parent's stream. + Otherwise, each connected component gets a different stream. + Nested SDFGs are processed recursively. + + Parameters + ---------- + sdfg : SDFG + The SDFG containing the state. + in_nested_sdfg : bool + True if the state is in a nested SDFG. + state : SDFGState + The state to process. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to assigned GPU streams (updated in-place). + gpu_stream : int + The current GPU stream ID. 
+ + Returns + ------- + None + """ + components = self._get_weakly_connected_nodes(state) + + for component in components: + + if not self._requires_gpu_stream(state, component): + continue + + nodes_assigned_before = len(stream_assignments) + + for node in component: + stream_assignments[node] = gpu_stream + if isinstance(node, nodes.NestedSDFG): + for nested_state in node.sdfg.states(): + self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, stream_assignments, gpu_stream) + + # Move to the next stream if we have assigned streams to any node in this component + # (careful: if nested, states are in same component) + if not in_nested_sdfg and len(stream_assignments) > nodes_assigned_before: + gpu_stream = self._next_stream(gpu_stream) + + def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: + """ + Returns all weakly connected components in the given directed graph. + + A weakly connected component is a maximal group of nodes such that each pair + of nodes is connected by a path when ignoring edge directions. + + Parameters + ---------- + graph: Graph + A directed graph instance. + + Returns + ------- + List[Set[Node_T]] + + A list containing sets of nodes, with each set corresponding to a weakly + connected component. + """ + visited: Set[NodeT] = set() + components: List[Set[NodeT]] = [] + + for node in graph.nodes(): + if node in visited: + continue + + # Start a new weakly connected component + component: Set[NodeT] = set() + stack = [node] + + while stack: + current = stack.pop() + if current in visited: + continue + + visited.add(current) + component.add(current) + + for neighbor in graph.neighbors(current): + if neighbor not in visited: + stack.append(neighbor) + + components.append(component) + + return components + + def _next_stream(self, gpu_stream: int) -> int: + """ + Compute the next CUDA stream index according to the concurrency configuration. + + Behavior depends on the configured max_concurrent_streams value: + - If 0: unlimited streams allowed, so increment the stream index by one. + - If -1: default setting, always return stream 0 (no concurrency). + - Otherwise: cycle through stream indices from 0 up to max_concurrent_streams - 1. + + Parameters + ---------- + gpu_stream : int + The current CUDA stream index. + + Returns + ------- + int + The next CUDA stream index based on the concurrency policy. + """ + if self._max_concurrent_streams == 0: + return gpu_stream + 1 + elif self._max_concurrent_streams == -1: + return 0 + else: + return (gpu_stream + 1) % self._max_concurrent_streams + + def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: + """ + Check whether a connected component in an SDFG state should be assigned + a GPU stream. + + A component requires a GPU stream if it contains at least one of: + - An AccessNode with GPU global memory storage, + - A MapEntry scheduled on a GPU device, + - A Tasklet whose code includes the stream placeholder. + + Parameters + ---------- + state : SDFGState + The state containing the component. + component : Set[NodeT] + The set of nodes that form the connected component. + + Returns + ------- + bool + True if the component requires a GPU stream, False otherwise. 
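+
+        Example
+        -------
+        A component containing an AccessNode of a GPU_Global array (e.g., ``gpu_A``) or a
+        GPU_Device-scheduled MapEntry requires a stream, whereas a chain consisting only of
+        host-side AccessNodes and tasklets without the stream placeholder does not.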
+ """ + + def gpu_relevant(node, parent) -> bool: + if (isinstance(node, nodes.AccessNode) and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): + return True + + elif (isinstance(node, nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): + return True + + elif (isinstance(node, nodes.Tasklet) and STREAM_PLACEHOLDER in node.code.as_string): + return True + + return False + + for node in component: + if isinstance(node, nodes.NestedSDFG): + if any(gpu_relevant(node, parent) for node, parent in node.sdfg.all_nodes_recursive()): + return True + + else: + if gpu_relevant(node, state): + return True + + return False diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py new file mode 100644 index 0000000000..7e1a62b29c --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py @@ -0,0 +1,274 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_copy_tasklet import InsertGPUCopyTasklets + + +@properties.make_properties +@transformation.explicit_cf_compatible +class GPUStreamTopologySimplification(ppl.Pass): + """ + Simplifies an SDFG after GPU stream nodes have been added. + + This pass is optional; the SDFG works without it, but it cleans up + the topology by merging adjacent or redundant GPU stream AccessNodes. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + } + + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Simplify the SDFG topology by merging adjacent GPU stream nodes. + """ + self._merge_close_gpustream_nodes(sdfg) + + self._merge_gpustreams_special_case(sdfg) + return {} + + def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: + """ + Merge "close" GPU stream AccessNodes in the SDFG. + + This function looks for a predecessor GPU stream AccessNode that can be merged + with any successor GPU stream AccessNodes of its grand-predecessors. 
+ + Example: + + Consider two GPU copy tasklets connected via distinct GPU stream AccessNodes: + the corresponding subgraph looks like this: + + -> Sink GPU Source GPU -> + ¦ ¦ + Tasklet ------> Data AccessNode -----> Tasklet + + This function would merge the sink and source node to simplify the SDFG. + """ + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Skip AccessNodes + if isinstance(node, nodes.AccessNode): + continue + + # Find GPU stream AccessNode predecessors with no incoming edges + # (i.e. source GPU stream AccessNodes) + node_predecessors = state.predecessors(node) + preceeding_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip if there are no preceding GPU stream sources + if len(preceeding_gpustream_sources) == 0: + continue + + # If multiple GPU stream sources exist, merge them; otherwise, use the single source + if len(preceeding_gpustream_sources) > 1: + combined_stream_node = preceeding_gpustream_sources.pop() + for preceeding_gpu_stream in preceeding_gpustream_sources: + # Note: there are no ingoing edges + for out_edge in state.out_edges(preceeding_gpu_stream): + _, src_conn, dst, dst_conn, data = out_edge + state.add_edge(combined_stream_node, src_conn, dst, dst_conn, data) + state.remove_edge(out_edge) + state.remove_node(preceeding_gpu_stream) + + else: + combined_stream_node = preceeding_gpustream_sources.pop() + + # Merge grand-predecessors' successors sink GPU streams with predecessor source GPU stream + node_grand_predecessors = [ + grand_pred for pred in node_predecessors for grand_pred in state.predecessors(pred) + ] + node_gp_successors_streams = [ + succ_of_gp for gp in node_grand_predecessors for succ_of_gp in state.successors(gp) + if isinstance(succ_of_gp, nodes.AccessNode) + and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ_of_gp) == 0 + ] + + # remove duplicates + node_gp_successors_streams = list(set(node_gp_successors_streams)) + + for gp_succ_stream in node_gp_successors_streams: + for edge in state.in_edges(gp_succ_stream): + src, src_conn, _, dst_conn, data = edge + state.add_edge(src, src_conn, combined_stream_node, dst_conn, data) + state.remove_edge(edge) + # Note: the grand-predecessor's successor GPU stream is a sink node and has no + # outgoing edges + state.remove_node(gp_succ_stream) + + def _merge_gpustreams_special_case(self, sdfg: SDFG) -> None: + """ + Special-case simplification of GPU stream AccessNodes. + + This pass detects the following pattern: + - A GPU stream AccessNode `X` has a predecessor and a successor (i.e. at least one of both). + - Between the predecessor and successor lie one or more tasklets. + - These tasklets use their own distinct GPU stream AccessNodes (not `X`), + which are connected only to the tasklet itself. + + To simplify the topology, redundant streams are merged: + - A single unified input GPU stream connects to the predecessor and replaces (merges) + the per-tasklet input streams. + - A single unified output GPU stream connects to the successor and replaces (merges) + the per-tasklet output streams. + + + The simplification is easier to understand visually than in words. + Inspect the intermediate SDFGs produced by the minimal example below + to see the effect of the stream merging. 
+ + Example + ------- + @dace.program + def example(A: dace.uint32[128], B: dace.uint32[128], + C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = example.to_sdfg() + sdfg.apply_gpu_transformations() + """ + # Get the name of the GPU stream arry + gpustream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + #------------------------- Preprocess: Gather Information ---------------------------- + + # For each GPU Stream AccessNode having a predecessor and a successor: + # Determine with which Tasklet Source and which Tasklet sink nodes lie between its predecessor + # and its successor + merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + + for node, state in sdfg.all_nodes_recursive(): + + # Skip non-tasklets + if not isinstance(node, nodes.Tasklet): + continue + + # The tasklets of interest should have exactly one preceeding source GPU node and one following sink GPU node + # If not, we skip + node_predecessors = state.predecessors(node) + node_successors = state.successors(node) + downstream_gpustream_sinks = [ + succ for succ in node_successors if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0 + ] + upstream_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip not considered case + if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) + and len(upstream_gpustream_sources) == 1): + continue + + # Look for potential predecessor of a "passthrough" GPU Stream AccessNode + # which would also be the grand-predeccessor of the current node (=tasklet) + candidate_predecessor = [] + for pred in node_predecessors: + for grand_pred in state.predecessors(pred): + + # Current nodes grand pred is a candidate of a predecessor of a "passthrough" GPU Stream AccessNode + candidate = grand_pred + + # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors + if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule + == dtypes.ScheduleType.GPU_Device or isinstance(candidate, nodes.Tasklet)): + continue + + has_passthrough_gpustream = any( + (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) and ( + state.in_degree(succ) > 0 and state.out_degree(succ) > 0) + for succ in state.successors(candidate)) + + if has_passthrough_gpustream: + candidate_predecessor.append(candidate) + + # Not "close" passthrough GPU node exists if no candidate predecessor exists + if len(candidate_predecessor) == 0: + continue + + # Niche case, more than one "close" passthrough GPU node exists: Out of scope + # Ignore this case (note: This Pass only makes the Graph visually nicer, so skipping has + # no effect on correctness) + if len(candidate_predecessor) > 1: + continue + + # Get the Kernel Exits GPU stream + candidate_predecessor = candidate_predecessor[0] + passthrough_gpu_node = [ + succ for succ in state.successors(candidate_predecessor) + if isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t + ][0] + + # Collect and store the GPU stream merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # 
Note: Len is 1 + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 + if (passthrough_gpu_node, state) in merge_source_gpustream: + merge_source_gpustream[(passthrough_gpu_node, state)].append(pre_gpustream) + merge_sink_gpustream[(passthrough_gpu_node, state)].append(succ_gpustream) + else: + merge_source_gpustream[(passthrough_gpu_node, state)] = [pre_gpustream] + merge_sink_gpustream[(passthrough_gpu_node, state)] = [succ_gpustream] + + #------------------------- Merge the GPU Stream AccessNodes ---------------------------- + for passthrough_gpu_node, state in merge_sink_gpustream.keys(): + + # Add new AccessNodes which merge the other loose streams + unified_in_stream = state.add_access(gpustream_array_name) + unified_out_stream = state.add_access(gpustream_array_name) + + for in_edge in state.in_edges(passthrough_gpu_node): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_in_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + + for out_edge in state.out_edges(passthrough_gpu_node): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + + for source_stream in merge_source_gpustream[passthrough_gpu_node, state]: + for out_edge in state.out_edges(source_stream): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_in_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + state.remove_node(source_stream) + + for sink_stream in merge_sink_gpustream[passthrough_gpu_node, state]: + for in_edge in state.in_edges(sink_stream): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_out_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + state.remove_node(sink_stream) + + state.remove_node(passthrough_gpu_node) diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py new file mode 100644 index 0000000000..162aa6143f --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py @@ -0,0 +1,166 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
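+#
+# Illustrative sketch (assuming the stream array is named `gpu_streams` and the per-stream
+# variable prefix is `gpu_stream_`): for a direct host-to-device copy edge A -> gpu_A whose
+# destination was assigned stream 0, the pass rewires the copy as
+#
+#   A --> Tasklet("gpu_copy") --> gpu_A
+#
+# where the tasklet body is the backend copy call produced by OutOfKernelCopyStrategy and the
+# tasklet is additionally connected to `gpu_streams[0]` through `gpu_stream_0` connectors.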
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace import memlet as mm +from dace.codegen.targets.gpu_helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.config import Config +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUCopyTasklets(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets + } + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. 
+        """
+        # Prepare GPU stream information
+        gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']
+        gpustream_array_name, gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')
+
+        # Initialize the strategy for copies that occur outside of kernel execution
+        out_of_kernel_copy = OutOfKernelCopyStrategy()
+
+        # Collect all data copies; only the out-of-kernel ones are processed below
+        copy_worklist = self.find_all_data_copies(sdfg)
+
+        for copy_sdfg, state, src_node, dst_node, edge in copy_worklist:
+
+            copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments)
+
+            # Only insert copy tasklets for GPU-related copies occurring outside of a
+            # kernel (i.e., a GPU_Device-scheduled map)
+            if not out_of_kernel_copy.applicable(copy_context):
+                continue
+
+            # Generate the copy call
+            code = out_of_kernel_copy.generate_copy(copy_context)
+
+            # Prepare GPU stream connectors and the stream to be accessed from the
+            # GPU stream array
+            gpustream_id = gpustream_assignments[dst_node]
+            gpustream_var_name = f"{gpustream_var_name_prefix}{gpustream_id}"
+            accessed_gpustream = f"{gpustream_array_name}[{gpustream_id}]"
+
+            # Create the tasklet and add GPU stream related connectors
+            tasklet = state.add_tasklet("gpu_copy", {}, {}, code, language=dtypes.Language.CPP)
+            tasklet.add_in_connector(gpustream_var_name, dtypes.gpuStream_t, True)
+            tasklet.add_out_connector(gpustream_var_name, dtypes.gpuStream_t, True)
+
+            # Add incoming and outgoing GPU stream AccessNodes to the tasklet
+            in_gpustream = state.add_access(gpustream_array_name)
+            out_gpustream = state.add_access(gpustream_array_name)
+            state.add_edge(in_gpustream, None, tasklet, gpustream_var_name, dace.Memlet(accessed_gpustream))
+            state.add_edge(tasklet, gpustream_var_name, out_gpustream, None, dace.Memlet(accessed_gpustream))
+
+            # Route the copy through the tasklet: replace the original edge by
+            # source -> tasklet and tasklet -> destination
+            dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge
+            state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet))
+            state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet))
+            state.remove_edge(edge)
+
+        return {}
+
+    def find_all_data_copies(
+            self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]:
+        """
+        Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node,
+        destination node, and the first memlet edge of the memlet path between source and destination node.
+
+        Parameters
+        ----------
+        sdfg : SDFG
+            The SDFG to analyze for potential data copies.
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py new file mode 100644 index 0000000000..2d2c1137de --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py @@ -0,0 +1,290 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import dtypes, properties, SDFG, SDFGState +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.helpers import is_within_schedule_types +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamSyncTasklets(ppl.Pass): + """ + Inserts GPU stream synchronization tasklets in an SDFG where needed. + + This pass uses a heuristic approach to find locations matching specific patterns + that require synchronization. Additional locations can be added easily if new + cases are discovered. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets + } + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Inserts GPU stream synchronization tasklets at required locations + after certain nodes and at the end of a state, for GPU streams used in the state. 
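+
+        For example, with the CUDA backend and a stream variable prefix of ``gpu_stream_``
+        (the exact names come from the ``compiler.cuda.gpu_stream_name`` config entry), the
+        synchronization tasklet emitted for stream 0 contains code along the lines of
+        ``DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream_0));``.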
+ """ + stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] + + # Get sync locations + sync_state, sync_node = self._identify_sync_locations(sdfg, stream_assignments) + + # Synchronize after a node when required + self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) + + # Synchronize all used streams at the end of a state + self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) + return {} + + def _identify_sync_locations( + self, sdfg: SDFG, + stream_assignments: Dict[nodes.Node, int]) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: + """ + Heuristically identifies GPU stream synchronization points in an SDFG. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream ids. + + Returns + ------- + Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]] + - **sync_state**: Maps each state to the set of stream IDs that should be + synchronized at the end of the state. + - **sync_node**: The keys of this dictionary are nodes after which synchronization + is needed, and their corresponding value is the state they belong to. + """ + + # ------------------ Helper predicates ----------------------------- + + def is_gpu_global_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage == dtypes.StorageType.GPU_Global + + def is_nongpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + + def is_kernel_exit(node): + return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device + + def is_sink_node(node, state): + return state.out_degree(node) == 0 + + def edge_within_kernel(state, src, dst): + gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN + src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) + dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) + return src_in_kernel and dst_in_kernel + + def is_tasklet_with_stream_use(src): + return isinstance(src, nodes.Tasklet) and STREAM_PLACEHOLDER in src.code.as_string + + # ------------------ Sync detection logic ----------------------------- + + sync_state: Dict[SDFGState, Set[int]] = {} + sync_node: Dict[nodes.Node, SDFGState] = {} + + for edge, state in sdfg.all_edges_recursive(): + src, dst = edge.src, edge.dst + + # Ensure state is initialized in sync_state + if state not in sync_state: + sync_state[state] = set() + + # --- Heuristics for when to sync --- + if (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) + and not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): + sync_node[dst] = state + sync_state[state].add(stream_assignments[dst]) + + elif (is_nongpu_accessnode(src, state) and is_gpu_global_accessnode(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and not is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[src]) + sync_state[state].add(stream_assignments[src]) + + elif (is_kernel_exit(src) and 
is_gpu_global_accessnode(dst, state) and is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[dst]) + + elif is_tasklet_with_stream_use(src): + sync_state[state].add(stream_assignments[src]) + + else: + continue + + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " + "Expected 'SDFGState'. Please handle this case explicitly.") + + # Remove states with no syncs + sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} + + return sync_state, sync_node + + def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Inserts GPU stream synchronization tasklets at the end of SDFG states. + + For each state that requires synchronization, this method: + + 1. Generates a tasklet that synchronizes all assigned GPU streams using + the appropriate backend (e.g., CUDA). + 2. Ensures all other operations in the state complete before synchronization + by connecting all sink nodes to the tasklet. + 3. Guarantees that only a single GPU stream AccessNode connects to the sync + tasklet, creating one if needed. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_state : Dict[SDFGState, Set[int] + Mapping of states to sets of stream IDs that require synchronization at the end of the state. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for state, streams in sync_state.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Build synchronization calls for all streams used in this state + sync_code_lines = [] + for stream in streams: + gpu_stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({gpu_stream_var_name}));" + sync_code_lines.append(sync_call) + sync_code = "\n".join(sync_code_lines) + + # Create the tasklet + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_code, + language=dtypes.Language.CPP) + + # ----------------- Connect sink nodes to the synchronization tasklet ----------------- + + # 1. Seperate GPU stream sink nodes and other sink nodes + stream_sink_nodes: List[nodes.AccessNode] = [] + non_stream_sink_nodes: List[nodes.Node] = [] + for sink_node in state.sink_nodes(): + if isinstance(sink_node, nodes.AccessNode) and sink_node.desc(state).dtype == dtypes.gpuStream_t: + stream_sink_nodes.append(sink_node) + + elif sink_node != tasklet: + non_stream_sink_nodes.append(sink_node) + + # 2. Connect non-stream sink nodes to the sync tasklet + for sink_node in non_stream_sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) + + # 3. 
Connect a single GPU stream sink node (create or merge if needed) + if len(stream_sink_nodes) == 0: + combined_stream_node = state.add_access(stream_array_name) + + else: + combined_stream_node = stream_sink_nodes.pop() + for stream_node in stream_sink_nodes: + for edge in state.in_edges(stream_node): + state.add_edge(edge.src, edge.src_conn, combined_stream_node, edge.dst_conn, edge.data) + state.remove_edge(edge) + state.remove_node(stream_node) + + # Connect back to output stream node + output_stream_node = state.add_access(combined_stream_node.data) + for stream in streams: + accessed_gpu_stream = f"{stream_array_name}[{stream}]" + conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet + + tasklet.add_in_connector(conn, dtypes.gpuStream_t) + tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) + state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) + + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Insert a GPU stream synchronization tasklet immediately after specified nodes. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_node : Dict[nodes.Node, SDFGState] + Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for node, state in sync_node.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Get assigned GPU stream + stream = stream_assignments.get(node, "nullptr") + if stream == "nullptr": + raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") + + # Create the tasklet + stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_call, + language=dtypes.Language.CPP) + + #----------------- Place tasklet between node and successors, link GPU streams ---------------- + + # 1. Put the tasklet between the node and its successors + for succ in state.successors(node): + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(node, None, tasklet, None, dace.Memlet()) + + # 2. 
Connect tasklet to GPU stream AccessNodes + in_stream = state.add_access(stream_array_name) + out_stream = state.add_access(stream_array_name) + accessed_stream = f"{stream_array_name}[{stream}]" + state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) + state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) + tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py new file mode 100644 index 0000000000..f45caa5dd0 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py @@ -0,0 +1,154 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import SDFG, dtypes, properties +from dace.config import Config +from dace.sdfg import is_devicelevel_gpu +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, Node, Tasklet +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToSDFGs(ppl.Pass): + """ + Inserts a GPU stream array into the top-level SDFG and propagates it to all + nested SDFGs that require it, including intermediate SDFGs along the hierarchy. + + This pass guarantees that every relevant SDFG has the array defined, avoiding + duplication and allowing subsequent passes in the GPU stream pipeline to rely + on its presence without redefining it. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Ensure that a GPU stream array is available in all SDFGs that require it. + + The pass creates the array once at the top-level SDFG and propagates it + down the hierarchy by inserting matching arrays in child SDFGs and wiring + them through nested SDFG connectors. This way, all SDFGs share a consistent + reference to the same GPU stream array. 
+ """ + + # Extract stream array name and number of streams to allocate + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + stream_assignments: Dict[Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + + # Add the GPU stream array at the top level + sdfg.add_transient(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Ensure GPU stream array is defined where required + for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): + + # Skip if this child already has the array (inserted higher up in the hierarchy) + if stream_array_name in child_sdfg.arrays: + continue + + # Add the array to the child SDFG + inner_sdfg = child_sdfg + inner_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Walk up the hierarchy until the array is found, inserting it into each parent + outer_sdfg = inner_sdfg.parent_sdfg + while stream_array_name not in outer_sdfg.arrays: + + # Insert array in parent SDFG + outer_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Connect parent SDFG array to nested SDFG node + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(stream_array_name)) + + # Continue climbing up the hierarchy + inner_sdfg = outer_sdfg + outer_sdfg = outer_sdfg.parent_sdfg + + # Ensure final connection from the first parent that had the array down to this SDFG + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) + + outer_sdfg = inner_sdfg.parent_sdfg + + return {} + + def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: + """ + Identify all child SDFGs that require a GPU stream array in their + array descriptor store. A child SDFG requires a GPU stream if: + + - It launches GPU kernels (MapEntry/MapExit with GPU_Device schedule). + - It contains special Tasklets (e.g., from library node expansion) that + use the GPU stream they are assigned to in the code. + - It accesses GPU global memory outside device-level GPU scopes, which + implies memory copies or kernel data feeds. + + Parameters + ---------- + sdfg : SDFG + The root SDFG to inspect. + + Returns + ------- + Set[SDFG] + The set of child SDFGs that need a GPU stream array in their array descriptor + store. 
+ """ + requiring_gpu_stream = set() + for child_sdfg in sdfg.all_sdfgs_recursive(): + + # Skip the root SDFG itself + if child_sdfg is sdfg: + continue + + for state in child_sdfg.states(): + for node in state.nodes(): + + # Case 1: Kernel launch nodes + if isinstance(node, (MapEntry, MapExit)) and node.map.schedule == dtypes.ScheduleType.GPU_Device: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 2: Tasklets that use GPU stream in their code + if isinstance(node, Tasklet) and STREAM_PLACEHOLDER in node.code.as_string: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 3: Accessing GPU global memory outside device-level scopes + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Global + and not is_devicelevel_gpu(state.sdfg, state, node)): + requiring_gpu_stream.add(child_sdfg) + break + + # Stop scanning this SDFG once a reason is found + if child_sdfg in requiring_gpu_stream: + break + + return requiring_gpu_stream diff --git a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py new file mode 100644 index 0000000000..bd913ae469 --- /dev/null +++ b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py @@ -0,0 +1,274 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets +from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class GPUStreamTopologySimplification(ppl.Pass): + """ + Simplifies an SDFG after GPU stream nodes have been added. + + This pass is optional; the SDFG works without it, but it cleans up + the topology by merging adjacent or redundant GPU stream AccessNodes. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + } + + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Simplify the SDFG topology by merging adjacent GPU stream nodes. + """ + self._merge_close_gpustream_nodes(sdfg) + + self._merge_gpustreams_special_case(sdfg) + return {} + + def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: + """ + Merge "close" GPU stream AccessNodes in the SDFG. 
+ + This function looks for a predecessor GPU stream AccessNode that can be merged + with any successor GPU stream AccessNodes of its grand-predecessors. + + Example: + + Consider two GPU copy tasklets connected via distinct GPU stream AccessNodes: + the corresponding subgraph looks like this: + + -> Sink GPU Source GPU -> + ¦ ¦ + Tasklet ------> Data AccessNode -----> Tasklet + + This function would merge the sink and source node to simplify the SDFG. + """ + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Skip AccessNodes + if isinstance(node, nodes.AccessNode): + continue + + # Find GPU stream AccessNode predecessors with no incoming edges + # (i.e. source GPU stream AccessNodes) + node_predecessors = state.predecessors(node) + preceeding_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip if there are no preceding GPU stream sources + if len(preceeding_gpustream_sources) == 0: + continue + + # If multiple GPU stream sources exist, merge them; otherwise, use the single source + if len(preceeding_gpustream_sources) > 1: + combined_stream_node = preceeding_gpustream_sources.pop() + for preceeding_gpu_stream in preceeding_gpustream_sources: + # Note: there are no ingoing edges + for out_edge in state.out_edges(preceeding_gpu_stream): + _, src_conn, dst, dst_conn, data = out_edge + state.add_edge(combined_stream_node, src_conn, dst, dst_conn, data) + state.remove_edge(out_edge) + state.remove_node(preceeding_gpu_stream) + + else: + combined_stream_node = preceeding_gpustream_sources.pop() + + # Merge grand-predecessors' successors sink GPU streams with predecessor source GPU stream + node_grand_predecessors = [ + grand_pred for pred in node_predecessors for grand_pred in state.predecessors(pred) + ] + node_gp_successors_streams = [ + succ_of_gp for gp in node_grand_predecessors for succ_of_gp in state.successors(gp) + if isinstance(succ_of_gp, nodes.AccessNode) + and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ_of_gp) == 0 + ] + + # remove duplicates + node_gp_successors_streams = list(set(node_gp_successors_streams)) + + for gp_succ_stream in node_gp_successors_streams: + for edge in state.in_edges(gp_succ_stream): + src, src_conn, _, dst_conn, data = edge + state.add_edge(src, src_conn, combined_stream_node, dst_conn, data) + state.remove_edge(edge) + # Note: the grand-predecessor's successor GPU stream is a sink node and has no + # outgoing edges + state.remove_node(gp_succ_stream) + + def _merge_gpustreams_special_case(self, sdfg: SDFG) -> None: + """ + Special-case simplification of GPU stream AccessNodes. + + This pass detects the following pattern: + - A GPU stream AccessNode `X` has a predecessor and a successor (i.e. at least one of both). + - Between the predecessor and successor lie one or more tasklets. + - These tasklets use their own distinct GPU stream AccessNodes (not `X`), + which are connected only to the tasklet itself. + + To simplify the topology, redundant streams are merged: + - A single unified input GPU stream connects to the predecessor and replaces (merges) + the per-tasklet input streams. + - A single unified output GPU stream connects to the successor and replaces (merges) + the per-tasklet output streams. + + + The simplification is easier to understand visually than in words. 
+ Inspect the intermediate SDFGs produced by the minimal example below + to see the effect of the stream merging. + + Example + ------- + @dace.program + def example(A: dace.uint32[128], B: dace.uint32[128], + C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = example.to_sdfg() + sdfg.apply_gpu_transformations() + """ + # Get the name of the GPU stream arry + gpustream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + #------------------------- Preprocess: Gather Information ---------------------------- + + # For each GPU Stream AccessNode having a predecessor and a successor: + # Determine with which Tasklet Source and which Tasklet sink nodes lie between its predecessor + # and its successor + merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + + for node, state in sdfg.all_nodes_recursive(): + + # Skip non-tasklets + if not isinstance(node, nodes.Tasklet): + continue + + # The tasklets of interest should have exactly one preceeding source GPU node and one following sink GPU node + # If not, we skip + node_predecessors = state.predecessors(node) + node_successors = state.successors(node) + downstream_gpustream_sinks = [ + succ for succ in node_successors if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0 + ] + upstream_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip not considered case + if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) + and len(upstream_gpustream_sources) == 1): + continue + + # Look for potential predecessor of a "passthrough" GPU Stream AccessNode + # which would also be the grand-predeccessor of the current node (=tasklet) + candidate_predecessor = [] + for pred in node_predecessors: + for grand_pred in state.predecessors(pred): + + # Current nodes grand pred is a candidate of a predecessor of a "passthrough" GPU Stream AccessNode + candidate = grand_pred + + # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors + if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule + == dtypes.ScheduleType.GPU_Device or isinstance(candidate, nodes.Tasklet)): + continue + + has_passthrough_gpustream = any( + (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) and ( + state.in_degree(succ) > 0 and state.out_degree(succ) > 0) + for succ in state.successors(candidate)) + + if has_passthrough_gpustream: + candidate_predecessor.append(candidate) + + # Not "close" passthrough GPU node exists if no candidate predecessor exists + if len(candidate_predecessor) == 0: + continue + + # Niche case, more than one "close" passthrough GPU node exists: Out of scope + # Ignore this case (note: This Pass only makes the Graph visually nicer, so skipping has + # no effect on correctness) + if len(candidate_predecessor) > 1: + continue + + # Get the Kernel Exits GPU stream + candidate_predecessor = candidate_predecessor[0] + passthrough_gpu_node = [ + succ for succ in state.successors(candidate_predecessor) + if isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t + ][0] + + # Collect and 
store the GPU stream merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # Note: Len is 1 + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 + if (passthrough_gpu_node, state) in merge_source_gpustream: + merge_source_gpustream[(passthrough_gpu_node, state)].append(pre_gpustream) + merge_sink_gpustream[(passthrough_gpu_node, state)].append(succ_gpustream) + else: + merge_source_gpustream[(passthrough_gpu_node, state)] = [pre_gpustream] + merge_sink_gpustream[(passthrough_gpu_node, state)] = [succ_gpustream] + + #------------------------- Merge the GPU Stream AccessNodes ---------------------------- + for passthrough_gpu_node, state in merge_sink_gpustream.keys(): + + # Add new AccessNodes which merge the other loose streams + unified_in_stream = state.add_access(gpustream_array_name) + unified_out_stream = state.add_access(gpustream_array_name) + + for in_edge in state.in_edges(passthrough_gpu_node): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_in_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + + for out_edge in state.out_edges(passthrough_gpu_node): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + + for source_stream in merge_source_gpustream[passthrough_gpu_node, state]: + for out_edge in state.out_edges(source_stream): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_in_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + state.remove_node(source_stream) + + for sink_stream in merge_sink_gpustream[passthrough_gpu_node, state]: + for in_edge in state.in_edges(sink_stream): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_out_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + state.remove_node(sink_stream) + + state.remove_node(passthrough_gpu_node) diff --git a/dace/transformation/passes/gpustream/gpustream_scheduling.py b/dace/transformation/passes/gpustream/gpustream_scheduling.py new file mode 100644 index 0000000000..0ad3c2e7c0 --- /dev/null +++ b/dace/transformation/passes/gpustream/gpustream_scheduling.py @@ -0,0 +1,249 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, List, Set, Type, Union + +import dace +from dace import SDFG, SDFGState, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg.graph import Graph, NodeT +from dace.transformation import pass_pipeline as ppl, transformation + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class NaiveGPUStreamScheduler(ppl.Pass): + """ + Assigns GPU streams to nodes and stores the assignments in a dictionary. + This can be useful for enabling asynchronous and parallel GPU computation using GPU streams. + + Strategy Overview: + ------------------ + - GPU stream assignment is based on weakly connected components (WCCs) within each state. + - Nodes in the same WCC are assigned to the same stream. + - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). + - In nested SDFGs: + * Stream assignment is inherited from the parent component, + * All internal components share the parent's stream. 
+ - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. + + Example: + -------- + A state with the following independent chains: + K1 → K2 + K3 → K4 → K5 + K6 + + would be scheduled as: + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 + + (assuming no limit on the number of concurrent streams) + + Note: + ----- + These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. + """ + + def __init__(self): + # Maximum number of concurrent streams allowed (from config). + # Cached locally for frequent reuse. + self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: + """ + Assigns GPU streams to nodes within the given SDFG. + + Parameters + ---------- + sdfg : SDFG + The top-level SDFG to process. + pipeline_results : Dict + Unused. + + Returns + ------- + Dict[nodes.Node, int] + A dictionary mapping each node to its assigned GPU stream. + """ + stream_assignments: Dict[nodes.Node, int] = dict() + for state in sdfg.states(): + self._assign_gpu_streams_in_state(sdfg, False, state, stream_assignments, 0) + + return stream_assignments + + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, + stream_assignments: Dict[nodes.Node, int], gpu_stream: int) -> None: + """ + Assigns GPU streams to nodes in a single state. + + If inside a nested SDFG, components inherit the parent's stream. + Otherwise, each connected component gets a different stream. + Nested SDFGs are processed recursively. + + Parameters + ---------- + sdfg : SDFG + The SDFG containing the state. + in_nested_sdfg : bool + True if the state is in a nested SDFG. + state : SDFGState + The state to process. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to assigned GPU streams (updated in-place). + gpu_stream : int + The current GPU stream ID. + + Returns + ------- + None + """ + components = self._get_weakly_connected_nodes(state) + + for component in components: + + if not self._requires_gpu_stream(state, component): + continue + + nodes_assigned_before = len(stream_assignments) + + for node in component: + stream_assignments[node] = gpu_stream + if isinstance(node, nodes.NestedSDFG): + for nested_state in node.sdfg.states(): + self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, stream_assignments, gpu_stream) + + # Move to the next stream if we have assigned streams to any node in this component + # (careful: if nested, states are in same component) + if not in_nested_sdfg and len(stream_assignments) > nodes_assigned_before: + gpu_stream = self._next_stream(gpu_stream) + + def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: + """ + Returns all weakly connected components in the given directed graph. + + A weakly connected component is a maximal group of nodes such that each pair + of nodes is connected by a path when ignoring edge directions. + + Parameters + ---------- + graph: Graph + A directed graph instance. + + Returns + ------- + List[Set[Node_T]] + + A list containing sets of nodes, with each set corresponding to a weakly + connected component. 
+ """ + visited: Set[NodeT] = set() + components: List[Set[NodeT]] = [] + + for node in graph.nodes(): + if node in visited: + continue + + # Start a new weakly connected component + component: Set[NodeT] = set() + stack = [node] + + while stack: + current = stack.pop() + if current in visited: + continue + + visited.add(current) + component.add(current) + + for neighbor in graph.neighbors(current): + if neighbor not in visited: + stack.append(neighbor) + + components.append(component) + + return components + + def _next_stream(self, gpu_stream: int) -> int: + """ + Compute the next CUDA stream index according to the concurrency configuration. + + Behavior depends on the configured max_concurrent_streams value: + - If 0: unlimited streams allowed, so increment the stream index by one. + - If -1: default setting, always return stream 0 (no concurrency). + - Otherwise: cycle through stream indices from 0 up to max_concurrent_streams - 1. + + Parameters + ---------- + gpu_stream : int + The current CUDA stream index. + + Returns + ------- + int + The next CUDA stream index based on the concurrency policy. + """ + if self._max_concurrent_streams == 0: + return gpu_stream + 1 + elif self._max_concurrent_streams == -1: + return 0 + else: + return (gpu_stream + 1) % self._max_concurrent_streams + + def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: + """ + Check whether a connected component in an SDFG state should be assigned + a GPU stream. + + A component requires a GPU stream if it contains at least one of: + - An AccessNode with GPU global memory storage, + - A MapEntry scheduled on a GPU device, + - A Tasklet whose code includes the stream placeholder. + + Parameters + ---------- + state : SDFGState + The state containing the component. + component : Set[NodeT] + The set of nodes that form the connected component. + + Returns + ------- + bool + True if the component requires a GPU stream, False otherwise. + """ + + def gpu_relevant(node, parent) -> bool: + if (isinstance(node, nodes.AccessNode) and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): + return True + + elif (isinstance(node, nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): + return True + + elif (isinstance(node, nodes.Tasklet) and STREAM_PLACEHOLDER in node.code.as_string): + return True + + return False + + for node in component: + if isinstance(node, nodes.NestedSDFG): + if any(gpu_relevant(node, parent) for node, parent in node.sdfg.all_nodes_recursive()): + return True + + else: + if gpu_relevant(node, state): + return True + + return False diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py new file mode 100644 index 0000000000..b4a7b9a65d --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -0,0 +1,288 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import dtypes, properties, SDFG, SDFGState +from dace.codegen import common +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamSyncTasklets(ppl.Pass): + """ + Inserts GPU stream synchronization tasklets in an SDFG where needed. + + This pass uses a heuristic approach to find locations matching specific patterns + that require synchronization. Additional locations can be added easily if new + cases are discovered. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Inserts GPU stream synchronization tasklets at required locations + after certain nodes and at the end of a state, for GPU streams used in the state. + """ + stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] + + # Get sync locations + sync_state, sync_node = self._identify_sync_locations(sdfg, stream_assignments) + + # Synchronize after a node when required + self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) + + # Synchronize all used streams at the end of a state + self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) + return {} + + def _identify_sync_locations( + self, sdfg: SDFG, + stream_assignments: Dict[nodes.Node, int]) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: + """ + Heuristically identifies GPU stream synchronization points in an SDFG. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream ids. + + Returns + ------- + Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]] + - **sync_state**: Maps each state to the set of stream IDs that should be + synchronized at the end of the state. + - **sync_node**: The keys of this dictionary are nodes after which synchronization + is needed, and their corresponding value is the state they belong to. 
+ """ + + # ------------------ Helper predicates ----------------------------- + + def is_gpu_global_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage == dtypes.StorageType.GPU_Global + + def is_nongpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + + def is_kernel_exit(node): + return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device + + def is_sink_node(node, state): + return state.out_degree(node) == 0 + + def edge_within_kernel(state, src, dst): + gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN + src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) + dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) + return src_in_kernel and dst_in_kernel + + def is_tasklet_with_stream_use(src): + return isinstance(src, nodes.Tasklet) and STREAM_PLACEHOLDER in src.code.as_string + + # ------------------ Sync detection logic ----------------------------- + + sync_state: Dict[SDFGState, Set[int]] = {} + sync_node: Dict[nodes.Node, SDFGState] = {} + + for edge, state in sdfg.all_edges_recursive(): + src, dst = edge.src, edge.dst + + # Ensure state is initialized in sync_state + if state not in sync_state: + sync_state[state] = set() + + # --- Heuristics for when to sync --- + if (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_node[dst] = state + sync_state[state].add(stream_assignments[dst]) + + elif (is_nongpu_accessnode(src, state) and is_gpu_global_accessnode(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and not is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[src]) + sync_state[state].add(stream_assignments[src]) + + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[dst]) + + elif is_tasklet_with_stream_use(src): + sync_state[state].add(stream_assignments[src]) + + else: + continue + + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " + "Expected 'SDFGState'. Please handle this case explicitly.") + + # Remove states with no syncs + sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} + + return sync_state, sync_node + + def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Inserts GPU stream synchronization tasklets at the end of SDFG states. + + For each state that requires synchronization, this method: + + 1. Generates a tasklet that synchronizes all assigned GPU streams using + the appropriate backend (e.g., CUDA). + 2. Ensures all other operations in the state complete before synchronization + by connecting all sink nodes to the tasklet. + 3. 
Guarantees that only a single GPU stream AccessNode connects to the sync + tasklet, creating one if needed. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_state : Dict[SDFGState, Set[int] + Mapping of states to sets of stream IDs that require synchronization at the end of the state. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for state, streams in sync_state.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Build synchronization calls for all streams used in this state + sync_code_lines = [] + for stream in streams: + gpu_stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({gpu_stream_var_name}));" + sync_code_lines.append(sync_call) + sync_code = "\n".join(sync_code_lines) + + # Create the tasklet + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_code, + language=dtypes.Language.CPP) + + # ----------------- Connect sink nodes to the synchronization tasklet ----------------- + + # 1. Seperate GPU stream sink nodes and other sink nodes + stream_sink_nodes: List[nodes.AccessNode] = [] + non_stream_sink_nodes: List[nodes.Node] = [] + for sink_node in state.sink_nodes(): + if isinstance(sink_node, nodes.AccessNode) and sink_node.desc(state).dtype == dtypes.gpuStream_t: + stream_sink_nodes.append(sink_node) + + elif sink_node != tasklet: + non_stream_sink_nodes.append(sink_node) + + # 2. Connect non-stream sink nodes to the sync tasklet + for sink_node in non_stream_sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) + + # 3. Connect a single GPU stream sink node (create or merge if needed) + if len(stream_sink_nodes) == 0: + combined_stream_node = state.add_access(stream_array_name) + + else: + combined_stream_node = stream_sink_nodes.pop() + for stream_node in stream_sink_nodes: + for edge in state.in_edges(stream_node): + state.add_edge(edge.src, edge.src_conn, combined_stream_node, edge.dst_conn, edge.data) + state.remove_edge(edge) + state.remove_node(stream_node) + + # Connect back to output stream node + output_stream_node = state.add_access(combined_stream_node.data) + for stream in streams: + accessed_gpu_stream = f"{stream_array_name}[{stream}]" + conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet + + tasklet.add_in_connector(conn, dtypes.gpuStream_t) + tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) + state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) + + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Insert a GPU stream synchronization tasklet immediately after specified nodes. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_node : Dict[nodes.Node, SDFGState] + Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. 
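[Editor's note] For illustration, a small stand-alone sketch of the synchronization code these tasklets end up containing. The `"gpu_streams,gpu_stream_"` config value, the backend, and the stream IDs are made-up examples; the real values come from `compiler.cuda.gpu_stream_name` and `common.get_gpu_backend()`.

# First part names the stream array, second is the per-stream variable prefix
stream_array_name, stream_var_prefix = "gpu_streams,gpu_stream_".split(",")
backend = "cuda"                         # or "hip"
streams = {0, 2}                         # streams used in the state

sync_code = "\n".join(
    f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_prefix}{s}));"
    for s in sorted(streams))
print(sync_code)
# DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream_0));
# DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream_2));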
+ """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for node, state in sync_node.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Get assigned GPU stream + stream = stream_assignments.get(node, "nullptr") + if stream == "nullptr": + raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") + + # Create the tasklet + stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" + tasklet = state.add_tasklet( name=f"gpu_stream_{stream}_synchronization", + inputs=set(), outputs=set(), + code=sync_call, language=dtypes.Language.CPP) + + + #----------------- Place tasklet between node and successors, link GPU streams ---------------- + + # 1. Put the tasklet between the node and its successors + for succ in state.successors(node): + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(node, None, tasklet, None, dace.Memlet()) + + # 2. Connect tasklet to GPU stream AccessNodes + in_stream = state.add_access(stream_array_name) + out_stream = state.add_access(stream_array_name) + accessed_stream = f"{stream_array_name}[{stream}]" + state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) + state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) + tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) + \ No newline at end of file diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py new file mode 100644 index 0000000000..23bb4c7c94 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py @@ -0,0 +1,70 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToKernels(ppl.Pass): + """ + This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps). + + Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, + indicating which GPU stream each kernel is assigned to. These assignments are e.g. + used when launching the kernels. 
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream array name and the prefix for individual stream variables + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Link kernels to their assigned GPU streams + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a kernel entry - continue + if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}" + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Assign the GPU stream to the kernel entry + kernel_entry = node + kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_in = state.add_access(stream_array_name) + state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, + dace.Memlet(accessed_gpu_stream)) + + # Assign the GPU stream to the kernel exit + kernel_exit = state.exit_node(kernel_entry) + kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_out = state.add_access(stream_array_name) + state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, + dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py new file mode 100644 index 0000000000..b55e4889a1 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py @@ -0,0 +1,155 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import SDFG, dtypes, properties +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types +from dace.config import Config +from dace.sdfg import is_devicelevel_gpu +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, Node, Tasklet +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToSDFGs(ppl.Pass): + """ + Inserts a GPU stream array into the top-level SDFG and propagates it to all + nested SDFGs that require it, including intermediate SDFGs along the hierarchy. + + This pass guarantees that every relevant SDFG has the array defined, avoiding + duplication and allowing subsequent passes in the GPU stream pipeline to rely + on its presence without redefining it. 
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Ensure that a GPU stream array is available in all SDFGs that require it. + + The pass creates the array once at the top-level SDFG and propagates it + down the hierarchy by inserting matching arrays in child SDFGs and wiring + them through nested SDFG connectors. This way, all SDFGs share a consistent + reference to the same GPU stream array. + """ + + # Extract stream array name and number of streams to allocate + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + stream_assignments: Dict[Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + + # Add the GPU stream array at the top level + sdfg.add_transient(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Ensure GPU stream array is defined where required + for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): + + # Skip if this child already has the array (inserted higher up in the hierarchy) + if stream_array_name in child_sdfg.arrays: + continue + + # Add the array to the child SDFG + inner_sdfg = child_sdfg + inner_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Walk up the hierarchy until the array is found, inserting it into each parent + outer_sdfg = inner_sdfg.parent_sdfg + while stream_array_name not in outer_sdfg.arrays: + + # Insert array in parent SDFG + outer_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Connect parent SDFG array to nested SDFG node + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(stream_array_name)) + + # Continue climbing up the hierarchy + inner_sdfg = outer_sdfg + outer_sdfg = outer_sdfg.parent_sdfg + + # Ensure final connection from the first parent that had the array down to this SDFG + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) + + outer_sdfg = inner_sdfg.parent_sdfg + + return {} + + def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: + """ + Identify all child SDFGs that require a GPU stream array in their + array descriptor store. A child SDFG requires a GPU stream if: + + - It launches GPU kernels (MapEntry/MapExit with GPU_Device schedule). + - It contains special Tasklets (e.g., from library node expansion) that + use the GPU stream they are assigned to in the code. 
+ - It accesses GPU global memory outside device-level GPU scopes, which + implies memory copies or kernel data feeds. + + Parameters + ---------- + sdfg : SDFG + The root SDFG to inspect. + + Returns + ------- + Set[SDFG] + The set of child SDFGs that need a GPU stream array in their array descriptor + store. + """ + requiring_gpu_stream = set() + for child_sdfg in sdfg.all_sdfgs_recursive(): + + # Skip the root SDFG itself + if child_sdfg is sdfg: + continue + + for state in child_sdfg.states(): + for node in state.nodes(): + + # Case 1: Kernel launch nodes + if isinstance(node, (MapEntry, MapExit)) and node.map.schedule == dtypes.ScheduleType.GPU_Device: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 2: Tasklets that use GPU stream in their code + if isinstance(node, Tasklet) and STREAM_PLACEHOLDER in node.code.as_string: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 3: Accessing GPU global memory outside device-level scopes + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Global + and not is_devicelevel_gpu(state.sdfg, state, node)): + requiring_gpu_stream.add(child_sdfg) + break + + # Stop scanning this SDFG once a reason is found + if child_sdfg in requiring_gpu_stream: + break + + return requiring_gpu_stream diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py new file mode 100644 index 0000000000..1438472da0 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py @@ -0,0 +1,80 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToTasklets(ppl.Pass): + """ + This pass ensures that tasklets which require access to their assigned GPU stream + are provided with it explicitly. + + Such tasklets typically originate from expanded LibraryNodes targeting GPUs. + These nodes may reference the special placeholder variable `__dace_current_stream`, + which is expected to be defined during unparsing in `cpp.py`. + + To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use + the GPU stream AccessNode directly. + + Note that this pass is similar to `InsertGPUStreamsToKernels`. 
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream's array name + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code + # and provide them the needed GPU stream explicitly + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a tasklet - continue + if not isinstance(node, nodes.Tasklet): + continue + + # Tasklet does not need use its assigned GPU stream - continue + if not STREAM_PLACEHOLDER in node.code.as_string: + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_conn = STREAM_PLACEHOLDER + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Provide the GPU stream explicitly to the tasklet + stream_array_in = state.add_access(stream_array_name) + stream_array_out = state.add_access(stream_array_name) + + node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t) + node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True) + + state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/insert_gpu_copy_tasklets.py b/dace/transformation/passes/insert_gpu_copy_tasklets.py new file mode 100644 index 0000000000..447adc7767 --- /dev/null +++ b/dace/transformation/passes/insert_gpu_copy_tasklets.py @@ -0,0 +1,166 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace import memlet as mm +from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.config import Config +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets +from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUCopyTasklets(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets + } + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. 
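[Editor's note] Purely illustrative: the kind of backend call such a copy tasklet would carry for an out-of-kernel host-to-device transfer. The actual code string is produced by `OutOfKernelCopyStrategy.generate_copy`, not by this sketch; the names and sizes below are made up.

backend = "cuda"
src, dst, nbytes, stream_var = "A", "gpu_A", 128 * 8, "gpu_stream_0"
copy_call = (f"DACE_GPU_CHECK({backend}MemcpyAsync({dst}, {src}, {nbytes}, "
             f"{backend}MemcpyHostToDevice, {stream_var}));")
print(copy_call)
# DACE_GPU_CHECK(cudaMemcpyAsync(gpu_A, A, 1024, cudaMemcpyHostToDevice, gpu_stream_0));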
+ """ + # Prepare GPU stream + gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + gpustream_array_name, gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Initialize the strategy for copies that occur outside of kernel execution + out_of_kernel_copy = OutOfKernelCopyStrategy() + + # Get all data copies to process the out of kernel copies + copy_worklist = self.find_all_data_copies(sdfg) + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments) + + # Only insert copy tasklets for GPU related copies occuring out of the + # kernel (i.e. a GPU_device scheduled map) + if not out_of_kernel_copy.applicable(copy_context): + continue + + # Generatae the copy call + code = out_of_kernel_copy.generate_copy(copy_context) + + # Prepare GPU ustream connectors and the stream to be accessed from the + # GPU stream array + gpustream_id = gpustream_assignments[dst_node] + gpustream_var_name = f"{gpustream_var_name_prefix}{gpustream_id}" + accessed_gpustream = f"{gpustream_array_name}[{gpustream_id}]" + + # Create the tasklet and add GPU stream related connectors + tasklet = state.add_tasklet("gpu_copy", {}, {}, code, language=dtypes.Language.CPP) + tasklet.add_in_connector(gpustream_var_name, dtypes.gpuStream_t, True) + tasklet.add_out_connector(gpustream_var_name, dtypes.gpuStream_t, True) + + # Add incoming and outgoing GPU stream accessNodes to the tasklet + in_gpustream = state.add_access(gpustream_array_name) + out_gpustream = state.add_access(gpustream_array_name) + state.add_edge(in_gpustream, None, tasklet, gpustream_var_name, dace.Memlet(accessed_gpustream)) + state.add_edge(tasklet, gpustream_var_name, out_gpustream, None, dace.Memlet(accessed_gpustream)) + + # Put the tasklet in between the edge + dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge + state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet)) + state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(edge) + + return {} + + def find_all_data_copies( + self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: + """ + Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, + destination node, and the first memlet edge of in the memlet path between source and destination node. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze for potential data copies. 
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist diff --git a/dace/transformation/passes/move_array_out_of_kernel.py b/dace/transformation/passes/move_array_out_of_kernel.py new file mode 100644 index 0000000000..bd7e401187 --- /dev/null +++ b/dace/transformation/passes/move_array_out_of_kernel.py @@ -0,0 +1,901 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, FrozenSet, Set, Tuple, List, Optional +import copy +import functools +from collections import deque + +import sympy + +import dace +from dace import SDFG, SDFGState, dtypes, data as dt +from dace.sdfg import nodes +from dace.properties import make_properties +from dace.transformation import transformation, helpers +from dace.transformation.pass_pipeline import Pass +from dace.subsets import Range +from dace.sdfg.graph import MultiConnectorEdge +from dace.memlet import Memlet +from dace.symbolic import symbol + +import dace.sdfg.utils as sdutil + + +@make_properties +@transformation.explicit_cf_compatible +class MoveArrayOutOfKernel(Pass): + """ + This pass supports a legacy use case in the 'ExperimentalCUDACodeGen' backend: the use of + transient arrays with dtypes.StorageType.GPU_Global inside GPU_Device scheduled maps (kernels). + Previously, the old 'CUDACodeGen' moved such arrays outside the kernel during codegen, which caused: + + 1. Mismatches between the SDFG and the generated code, + 2. Complex, misplaced logic in codegen, + 3. Incorrect semantics — a single shared array was reused instead of per-iteration replication, + leading to race conditions. + + This pass fixes these issues by explicitly lifting such arrays out of GPU_Device maps + and creating disjoint arrays per map iteration. Unlike the legacy approach, the transformation + is now visible and consistent at the SDFG level, avoiding naming collisions and improving clarity. + + NOTE: There is no true "local device (GPU_Device) memory" on GPUs, but DaCe supports this + pattern for legacy reasons. This pass exists purely for backward compatibility, and its use + is strongly discouraged. + """ + + def __init__(self): + """ + Initializes caches for mapping nodes to their states and SDFGs. + + This avoids repeatedly traversing the SDFG structure during the pass. + The caches are populated in `apply_pass` for convenience. 
+ """ + self._node_to_state_cache: Dict[nodes.Node, SDFGState] = dict() + self._node_to_sdfg_cache: Dict[nodes.Node, SDFG] = dict() + + # Entry point + def apply_pass(self, root_sdfg: SDFG, kernel_entry: nodes.MapEntry, array_name: str) -> None: + """ + Applies the pass to move a transient GPU_Global array out of a GPU_Device map. + + Args: + root_sdfg: The top-level SDFG to operate on. + kernel_entry: The MapEntry node representing the GPU_Device scheduled map (i.e., the kernel) + that contains the transient array. + array_name: The name of the transient array to move. Note that multiple arrays with the + same name may exist within the kernel. All will be lifted. + """ + # Cache every nodes parent state and parent sdfg + for node, parent in root_sdfg.all_nodes_recursive(): + if isinstance(node, nodes.Node): + assert isinstance(parent, SDFGState) + self._node_to_state_cache[node] = parent + self._node_to_sdfg_cache[node] = parent.sdfg + + # Check if all access nodes to 'array_name' within the kernel are defined in the same SDFG as the map + kernel_parent_sdfg = self._node_to_sdfg_cache[kernel_entry] + simple_case = True + for (_, outermost_sdfg, _, _) in self.collect_array_descriptor_usage(kernel_entry, array_name): + if outermost_sdfg != kernel_parent_sdfg: + simple_case = False + break + + if simple_case: + # All access nodes are in the same SDFG as the kernel map - easy + access_nodes = [an for an, _, _ in self.get_access_nodes_within_map(kernel_entry, array_name)] + self.move_array_out_of_kernel_flat(kernel_entry, array_name, access_nodes) + else: + # Access nodes span nested maps or SDFGs — more involved (more checks, naming conflicts, several seperate + # array descriptors with the same array_name) + self.move_array_out_of_kernel_nested(kernel_entry, array_name) + + # Main transformation algorithms and helpers + def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name: str, + access_nodes: List[nodes.AccessNode]) -> None: + """ + Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the flat case. + + This function handles the simpler case where all access nodes to the array are in the same + SDFG and state as the kernel map. Therefore, there are no nested SDFGs or naming conflicts + (since an SDFG cannot define multiple descriptors with the same name). + + The array is reshaped to allocate a disjoint slice per map iteration. For example, given: + + for x, y in dace.map[0:128, 0:32] @ GPU_Device: + gpu_A = dace.define_local([64], dtype, storage=GPU_Global) + + the array shape will be updated to [128, 32, 64], and memlets will ensure each thread + accesses [x, y, 0:64]. + + Additionally, this method inserts the necessary access nodes and edges to correctly move + the array out of the map scope and maintain correctness. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel. + array_name: Name of the transient array to move. + access_nodes: List of access nodes referring to the array inside the map. + """ + # A closest AccessNode of kernel exit is used + parent_state = self._node_to_state_cache[kernel_entry] + kernel_exit: nodes.MapExit = parent_state.exit_node(kernel_entry) + closest_an = self.get_nearest_access_node(access_nodes, kernel_exit) + array_desc = closest_an.desc(parent_state) + + # Get the chain of MapEntries from the AccessNode up to and including the kernel map entry + map_entry_chain, _ = self.get_maps_between(kernel_entry, closest_an) + + # Store the original full-range subset of the array. 
+ # Needed to define correct memlets when moving the array out of the kernel. + old_subset = [(0, dim - 1, 1) for dim in array_desc.shape] + + # Update the array + new_shape, new_strides, new_total_size, new_offsets = self.get_new_shape_info(array_desc, map_entry_chain) + array_desc.set_shape(new_shape=new_shape, strides=new_strides, total_size=new_total_size, offset=new_offsets) + + # Update all memlets + self.update_memlets(kernel_entry, array_name, closest_an, access_nodes) + + # add new edges to move access Node out of map + in_connector: str = 'IN_' + array_name + out_connector: str = 'OUT_' + array_name + previous_node = closest_an + previous_out_connector = None + for next_map_entry in map_entry_chain: + + next_map_exit = parent_state.exit_node(next_map_entry) + if in_connector not in next_map_exit.in_connectors: + next_map_state = self._node_to_state_cache[next_map_exit] + next_map_exit.add_in_connector(in_connector) + next_map_exit.add_out_connector(out_connector) + + next_entries, _ = self.get_maps_between(kernel_entry, previous_node) + + next_map_state.add_edge(previous_node, previous_out_connector, next_map_exit, in_connector, + Memlet.from_array(array_name, array_desc)) + + previous_node = next_map_exit + previous_out_connector = out_connector + + # New Access Node outside of the target map, connected to the exit + access_node_outside = parent_state.add_access(array_name) + parent_state.add_edge(kernel_exit, out_connector, access_node_outside, None, + Memlet.from_array(array_name, array_desc)) + + def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_name: str) -> None: + """ + Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the nested case. + + This function handles the more complex scenario where access nodes to the array may be + defined inside nested SDFGs within the kernel's parent SDFG. It moves the array out of + all nested maps and SDFGs, updating shapes and memlets accordingly, and resolves naming + conflicts that arise from multiple descriptors with the same name in different scopes + (by renaming). + + The method also ensures that the array is correctly lifted through all nested SDFGs + between its original definition and the kernel map, updating symbols and connectors + along the way. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel. + array_name: Name of the transient array to move. + """ + # Collect all information about every distinct data descriptor with the same name "array_name" + array_descriptor_usage = self.collect_array_descriptor_usage(kernel_entry, array_name) + original_array_name = array_name + kernel_parent_sdfg = self._node_to_sdfg_cache[kernel_entry] + + for array_desc, outermost_sdfg, sdfg_defined, access_nodes in array_descriptor_usage: + + if outermost_sdfg == kernel_parent_sdfg: + # Special case: There are nested accesss nodes, but their descriptor is defined at + # the same sdfg as the kernel. Thus, we can use the simpler algorithm. + self.move_array_out_of_kernel_flat(kernel_entry, original_array_name, list(access_nodes)) + continue + + # The outermost node + nsdfg_node = outermost_sdfg.parent_nsdfg_node + map_entry_chain, _ = self.get_maps_between(kernel_entry, nsdfg_node) + + # Store the original full-range subset of the array. + # Needed to define correct memlets when moving the array out of the kernel. 
+ old_subset = [(0, dim - 1, 1) for dim in array_desc.shape] + + # Update array_descriptor + new_shape, new_strides, new_total_size, new_offsets = self.get_new_shape_info(array_desc, map_entry_chain) + array_desc.set_shape(new_shape=new_shape, + strides=new_strides, + total_size=new_total_size, + offset=new_offsets) + array_desc.transient = False + + # Update memlets data movement + self.update_memlets(kernel_entry, original_array_name, nsdfg_node, access_nodes) + + # Update name if names conflict + required, array_name = self.new_name_required(kernel_entry, original_array_name, sdfg_defined) + if required: + self.replace_array_name(sdfg_defined, original_array_name, array_name, array_desc) + + # Ensure required symbols are defined + self.update_symbols(map_entry_chain, kernel_parent_sdfg) + + # Collect all SDFGs from the outermost definition to the target map's parent (inclusive) + sdfg_hierarchy: List[SDFG] = [outermost_sdfg] + current_sdfg = outermost_sdfg + while current_sdfg != kernel_parent_sdfg: + current_sdfg = current_sdfg.parent_sdfg + sdfg_hierarchy.append(current_sdfg) + + # Validate collected SDFGs: no None entries + if any(sdfg is None for sdfg in sdfg_hierarchy): + raise ValueError("Invalid SDFG hierarchy: contains 'None' entries. This should not happen.") + + # Validate depth: must include at least outer + target SDFG + if len(sdfg_hierarchy) < 2: + raise ValueError(f"Invalid SDFG hierarchy: only one SDFG found. " + f"Expected at least two levels, since {outermost_sdfg} is not equal to " + "the kernel map's SDFG and is contained within it — the last entry should " + "be the kernel's parent SDFG.") + + self.lift_array_through_nested_sdfgs(array_name, kernel_entry, sdfg_hierarchy, old_subset) + + def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.MapEntry, sdfg_hierarchy: List[SDFG], + old_subset: List) -> None: + """ + Lifts a transient array through nested SDFGs. + + For each SDFG in the hierarchy (from inner to outer), this deepcopies the array descriptor + and adds edges from the NestedSDFG node through any enclosing maps to a new access node. + This is done until the kernel is exited. + Memlets are updated using `old_subset` and enclosing map parameters. + + Args: + array_name: Name of the array to lift. + kernel_entry: Innermost GPU kernel MapEntry. + sdfg_hierarchy: Ordered list of nested SDFGs (inner to outer). + old_subset: Inner array subset used for memlet construction. + """ + # Move array out ouf the kernel map entry through nested SDFGs + outer_sdfg = sdfg_hierarchy.pop(0) + while sdfg_hierarchy: + inner_sdfg = outer_sdfg + outer_sdfg = sdfg_hierarchy.pop(0) + nsdfg_node = inner_sdfg.parent_nsdfg_node + nsdfg_parent_state = self._node_to_state_cache[nsdfg_node] + + # copy and add the descriptor to the outer sdfg + old_desc = inner_sdfg.arrays[array_name] + new_desc = copy.deepcopy(old_desc) + outer_sdfg.add_datadesc(array_name, new_desc) + + # Get all parent scopes to detect how the data needs to flow. + # E.g. nsdfg_node -> MapExit needs to be nsdfg_node -> MapExit -> AccessNode (new) + parent_scopes: List[nodes.MapEntry] = [] + current_parent_scope = nsdfg_node + scope_dict = nsdfg_parent_state.scope_dict() + while scope_dict[current_parent_scope] is not None and current_parent_scope is not kernel_entry: + parent_scopes.append(scope_dict[current_parent_scope]) + current_parent_scope = scope_dict[current_parent_scope] + + # Get a new AccessNode where the nsdfg node's parent state is. 
+ # Note: This is in the OUTER sdfg, so this is the first accessNode accessing + # the current array descriptor + exit_access_node = nsdfg_parent_state.add_access(array_name) + + # Cache its location + self._node_to_state_cache[exit_access_node] = nsdfg_parent_state + self._node_to_sdfg_cache[exit_access_node] = outer_sdfg + + # Create a dataflow path from the NestedSDFG node to the new exit access node, + # passing through any enclosing map scopes (if the NestedSDFG is nested within maps). + src = nsdfg_node + for scope_entry in parent_scopes: + # next destination is the scope exit + scope_exit = nsdfg_parent_state.exit_node(scope_entry) + dst = scope_exit + + # Next, add edge between src and dst in 2 steps: + # 1.1 Determine source connector name and register it based on src type + if isinstance(src, nodes.NestedSDFG): + src_conn = array_name + src.add_out_connector(src_conn) + elif isinstance(src, nodes.MapExit): + src_conn = f"OUT_{array_name}" + src.add_out_connector(src_conn) + else: + raise NotImplementedError( + f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected." + ) + + # 1.2 Determine destination connector name and register it based on dst type + if isinstance(dst, nodes.AccessNode): + dst_conn = None # AccessNodes use implicit connectors + elif isinstance(dst, nodes.MapExit): # Assuming dst is the entry for parent scope + dst_conn = f"IN_{array_name}" + dst.add_in_connector(dst_conn) + else: + raise NotImplementedError( + f"Unsupported destination node type '{type(dst).__name__}' — expected AccessNode or MapEntry.") + + # 2. Add the edge using the connector names determined in Step 1. + next_entries, _ = self.get_maps_between(kernel_entry, src) + memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) + nsdfg_parent_state.add_edge(src, src_conn, dst, dst_conn, Memlet.from_array(array_name, new_desc)) + + # Continue by setting the dst as source + src = dst + + # After processing all scopes, the last src (which is either the last MapExit or the intial nsdfg if there are no parent scope) + # needs to be connected to the exit access node added before + dst = exit_access_node + + if isinstance(src, nodes.NestedSDFG): + src_conn = array_name + src.add_out_connector(src_conn) + elif isinstance(src, nodes.MapExit): + src_conn = f"OUT_{array_name}" + src.add_out_connector(src_conn) + else: + raise NotImplementedError( + f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") + + next_entries, _ = self.get_maps_between(kernel_entry, src) + memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) + nsdfg_parent_state.add_edge(src, src_conn, dst, None, Memlet.from_array(array_name, new_desc)) + + # At the outermost sdfg we set the array descriptor to be transient again, + # Since it is not needed beyond it. Furthermore, this ensures that the codegen + # allocates the array and does not expect it as input to the kernel + new_desc.transient = True + + # Memlet related helper functions + def get_memlet_subset(self, map_chain: List[nodes.MapEntry], node: nodes.Node): + """ + Compute the memlet subset to access an array based on the position of a node within nested GPU maps. + + For each GPU_Device or GPU_ThreadBlock map in the chain: + - If the node lies inside the map (but is not the map entry or exit itself), + the subset is the single index corresponding to the map parameter (symbolic). + - Otherwise, the full range of the map dimension is used. 
+ + This ensures that memlets correctly represent per-thread or per-block slices + when moving arrays out of kernel scopes. + + Args: + map_chain: List of MapEntry nodes representing nested maps from outermost to innermost. + node: The node for which to determine the subset (could be an access node or map entry/exit). + + Returns: + A list of subsets (start, end, stride) tuples for each map dimension. + """ + subset = [] + for next_map in map_chain: + if not next_map.map.schedule in [dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock]: + continue + + map_parent_state = self._node_to_state_cache[next_map] + for param, (start, end, stride) in zip(next_map.map.params, next_map.map.range.ndrange()): + + node_is_map = ((isinstance(node, nodes.MapEntry) and node == next_map) + or (isinstance(node, nodes.MapExit) and map_parent_state.exit_node(next_map) == node)) + node_state = self._node_to_state_cache[node] + if helpers.contained_in(node_state, node, next_map) and not node_is_map: + index = symbol(param) + subset.append((index, index, 1)) + else: + subset.append((start, end, stride)) + + return subset + + def update_memlets(self, kernel_entry: nodes.MapEntry, array_name: str, outermost_node: nodes.Node, + access_nodes: Set[nodes.AccessNode]) -> None: + """ + Updates all memlets related to a given transient array to reflect correct data + movement when moving array out of the kernel entry. + + Any map enclosing the `outermost_node` also encloses all access nodes and is + used to determine which maps are strictly above the access nodes. Based on this, + we compute the correct memlet subset that includes the additional dimensions + from the GPU map hierarchy. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel scope. + array_name: Name of the transient array being moved out. + outermost_node: The outermost node. + access_nodes: Set of AccessNodes inside the kernel that reference the same array. 
+ """ + map_entry_chain, _ = self.get_maps_between(kernel_entry, outermost_node) + params_as_ranges = self.get_memlet_subset(map_entry_chain, outermost_node) + + # Update in and out path memlets + visited: Set[MultiConnectorEdge[Memlet]] = set() + for access_node in access_nodes: + # in paths + for path in self.in_paths(access_node): + for edge in path: + + # Guards + if edge in visited: + continue + + if edge.data.data == array_name: + old_range = edge.data.subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.subset = Range(new_range) + visited.add(edge) + + elif edge.data.data != array_name and edge.dst is access_node and edge.data.dst_subset is not None: + old_range = edge.data.dst_subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.dst_subset = Range(new_range) + visited.add(edge) + + else: + continue + + # out paths + for path in self.out_paths(access_node): + for edge in path: + if edge in visited: + continue + + if edge.data.data == array_name: + old_range = edge.data.subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.subset = Range(new_range) + visited.add(edge) + + elif (edge.data.data + != array_name) and edge.src is access_node and edge.data.src_subset is not None: + old_range = edge.data.src_subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.src_subset = Range(new_range) + visited.add(edge) + + else: + continue + + # Array, symbol and renaming related helper functions + def get_new_shape_info(self, array_desc: dt.Array, map_exit_chain: List[nodes.MapEntry]): + """ + Calculate the new shape, strides, total size, and offsets for a transient array + when moving it out of a GPU_Device kernel. + + Each GPU_Device map adds dimensions to allocate disjoint slices per thread. + + For example: + + for x, y in dace.map[0:128, 0:32] @ GPU_Device: + gpu_A = dace.define_local([64], dtype, storage=GPU_Global) + + gpu_A's shape changes from [64] to [128, 32, 64] to give each thread its own slice + (i.e. gpu_A[x, y, 64]). + + Args: + array_desc: Original array descriptor. + map_exit_chain: List of MapEntry nodes between array and kernel exit. + + Returns: + Tuple (new_shape, new_strides, new_total_size, new_offsets) for the updated array. 
+ """ + extended_size = [] + new_strides = list(array_desc.strides) + new_offsets = list(array_desc.offset) + for next_map in map_exit_chain: + if not next_map.map.schedule in [dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock]: + continue + + map_range: Range = next_map.map.range + max_elements = map_range.max_element() + min_elements = map_range.min_element() + range_size = [max_elem + 1 - min_elem for max_elem, min_elem in zip(max_elements, min_elements)] + + #TODO: check this / clean (maybe support packed C and packed Fortran layouts separately for code readability future) + old_total_size = array_desc.total_size + accumulator = old_total_size + new_strides.insert(0, old_total_size) + for cur_range_size in range_size[:-1]: + new_strides.insert(0, accumulator) # insert before (mult with volumes) + accumulator = accumulator * cur_range_size + + extended_size = range_size + extended_size + #new_strides = [1 for _ in next_map.map.params] + new_strides # add 1 per dimension + new_offsets = [0 for _ in next_map.map.params] + new_offsets # add 0 per dimension + + new_shape = extended_size + list(array_desc.shape) + new_total_size = functools.reduce(sympy.Mul, extended_size, 1) * array_desc.total_size + + return new_shape, new_strides, new_total_size, new_offsets + + # TODO: Ask Yakup -> No states test but this should be alright + def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: str, array_desc: dt.Array) -> None: + """ + Replaces all occurrences of an array name in the given SDFGs, including its data descriptor, + memlets, connectors and access nodes with a new name. + + Args: + sdfgs (Set[SDFG]): The SDFGs in which to perform the renaming. + old_name (str): The original array name to be replaced. + new_name (str): The new array name. + new_descriptor (dt.Array): The data descriptor associated with the old and new name. + """ + for sdfg in sdfgs: + + # Replace by removing the data descriptor and adding it with the new name + sdfg.remove_data(old_name, False) + sdfg.add_datadesc(new_name, array_desc) + sdfg.replace(old_name, new_name) + + # Find all states + for state in sdfg.states(): + for edge in state.edges(): + + # Update out connectors + src = edge.src + old_out_conn = f"OUT_{old_name}" + new_out_conn = f"OUT_{new_name}" + if edge.src_conn == old_out_conn: + edge.src_conn = new_out_conn + src.remove_out_connector(old_out_conn) + src.add_out_connector(new_out_conn) + + # Update in connectors + dst = edge.dst + old_in_conn = f"IN_{old_name}" + new_in_conn = f"IN_{new_name}" + if edge.dst_conn == old_in_conn: + edge.dst_conn = new_in_conn + dst.remove_in_connector(old_in_conn) + dst.add_in_connector(new_in_conn) + + def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) -> None: + """ + Ensures symbols from GPU maps are defined in all nested SDFGs. + + When lifting arrays out of GPU maps, any used symbols (e.g., map indices) + must be available in nested SDFGs for correct memlet updates. + This function collects such symbols from the map scopes and adds them to + the symbol tables and mappings of all nested SDFGs under `top_sdfg`. + + Args: + map_entry_chain: List of GPU MapEntry nodes whose symbols are relevant. + top_sdfg: The top-level SDFG under which symbols will be propagated. 
+ """ + all_symbols = set() + for next_map in map_entry_chain: + if not next_map.map.schedule in [ + dace.dtypes.ScheduleType.GPU_Device, dace.dtypes.ScheduleType.GPU_ThreadBlock + ]: + continue + all_symbols = all_symbols | next_map.used_symbols_within_scope(self._node_to_state_cache[next_map]) + + for sdfg in top_sdfg.all_sdfgs_recursive(): + nsdfg_node = sdfg.parent_nsdfg_node + if nsdfg_node is None: + continue + + for symbol in all_symbols: + if str(symbol) not in sdfg.symbols: + sdfg.add_symbol(str(symbol), dace.dtypes.int32) + if str(symbol) not in nsdfg_node.symbol_mapping: + nsdfg_node.symbol_mapping[symbol] = dace.symbol(symbol) + + # Array analysis and metadata functions + def collect_array_descriptor_usage( + self, map_entry: nodes.MapEntry, + array_name: str) -> Set[Tuple[dt.Array, SDFG, FrozenSet[SDFG], FrozenSet[nodes.AccessNode]]]: + """ + Tracks usage of a transient array across nested SDFGs within the scope of a map. + + For each array it collects: + - the outermost SDFG where it is defined or passed through, + - all SDFGs in which it is accessed or passed via connectors, + - all AccessNodes referencing it in those SDFGs. + + Note: By "same array" we mean arrays with the same name and connected via memlets; + multiple descriptor objects (dt.Array) may exist across SDFGs for the same logical array. + + Args: + map_entry: The MapEntry node whose scope is used for analysis. + array_name: The name of the array to analyze. + + Returns: + A set of tuples, each containing: + - one of potentially many dt.Array descriptors, + - the outermost defining or using SDFG, + - a frozenset of all involved SDFGs, + - a frozenset of all AccessNodes using this array. + """ + access_nodes_info: List[Tuple[nodes.AccessNode, SDFGState, + SDFG]] = self.get_access_nodes_within_map(map_entry, array_name) + + last_sdfg: SDFG = self._node_to_sdfg_cache[map_entry] + + result: Set[Tuple[dt.Array, SDFG, Set[SDFG], Set[nodes.AccessNode]]] = set() + visited_sdfgs: Set[SDFG] = set() + + for access_node, state, sdfg in access_nodes_info: + + # Skip visited sdfgs where the array name is defined + if sdfg in visited_sdfgs: + continue + + # Get the array_desc (there may be several copies across SDFG, but + # we are only interested in the information thus this is fine) + array_desc = access_node.desc(state) + + # Collect all sdfgs and access nodes which refer to the same array + # (we determine this by inspecting if the array name is passed via connectors) + sdfg_set: Set[SDFG] = set() + access_nodes_set: Set[nodes.AccessNode] = set() + access_nodes_set.add(access_node) + + # Get all parent SDFGs and the outermost sdfg where defined + current_sdfg = sdfg + outermost_sdfg = current_sdfg + while True: + sdfg_set.add(current_sdfg) + + # We have reached the map's sdfg, so this is the + # outermost_sdfg we consider + if current_sdfg == last_sdfg: + outermost_sdfg = current_sdfg + break + + nsdfg_node = current_sdfg.parent_nsdfg_node + if array_name in nsdfg_node.in_connectors or array_name in nsdfg_node.out_connectors: + current_sdfg = current_sdfg.parent_sdfg + outermost_sdfg = current_sdfg + else: + break + + # Get all child SDFGs where the array was also passed to + queue = [sdfg] + while queue: + current_sdfg = queue.pop(0) + for child_state in current_sdfg.states(): + for node in child_state.nodes(): + if not isinstance(node, nodes.NestedSDFG): + continue + + nsdfg_node = node + if array_name in nsdfg_node.in_connectors or array_name in nsdfg_node.out_connectors: + queue.append(nsdfg_node.sdfg) + 
sdfg_set.add(nsdfg_node.sdfg) + + # Get all access nodes with the array name used in the sdfgs we found + for current_sdfg in sdfg_set: + for current_state in current_sdfg.states(): + for node in current_state.nodes(): + if isinstance(node, nodes.AccessNode) and node.data == array_name: + access_nodes_set.add(node) + + # Update all visited sdfgs + visited_sdfgs.update(sdfg_set) + + # Finally add information to the result + result.add((array_desc, outermost_sdfg, frozenset(sdfg_set), frozenset(access_nodes_set))) + + return result + + def new_name_required(self, map_entry: nodes.MapEntry, array_name: str, + sdfg_defined: FrozenSet[SDFG]) -> Tuple[bool, str]: + """ + Returns whether the array_name is also used at an SDFG which is not in the sdfg_defined set. + This means that the array_name at that SDFG refers to another data descriptor. + Another new name is suggested if this case occurs. + + Args: + map_entry: The MapEntry node whose scope is used to determine name usage. + array_name: The name of the data descriptor of interest + sdfg_defined: where the data descriptor is defined + + Returns: + A Tuple where first element is indicatin whether a new name is required, and + the other is either the same name if no new name is required or otherwise a new name suggestion. + """ + map_parent_sdfg = self._node_to_sdfg_cache[map_entry] + taken_names = set() + + for sdfg in map_parent_sdfg.all_sdfgs_recursive(): + + # Continue if sdfg is neither the map's parent state + # or not contained within the map scope + nsdfg_node = sdfg.parent_nsdfg_node + state = self._node_to_state_cache[nsdfg_node] if nsdfg_node else None + + if not ((nsdfg_node and state and helpers.contained_in(state, nsdfg_node, map_entry)) + or sdfg is map_parent_sdfg): + continue + + # Taken names are all symbol and array identifiers of sdfgs in which + # the array_name's data descriptor we are interested in IS NOT defined + if sdfg not in sdfg_defined: + taken_names.update(sdfg.arrays.keys()) + taken_names.update(sdfg.used_symbols(True)) + + if array_name in taken_names: + counter = 0 + new_name = f"local_{counter}_{array_name}" + while new_name in taken_names: + counter += 1 + new_name = f"local_{counter}_{array_name}" + + return True, new_name + else: + return False, array_name + + # Utility functions - basic building blocks + def get_access_nodes_within_map(self, map_entry: nodes.MapEntry, + data_name: str) -> List[Tuple[nodes.AccessNode, SDFGState, SDFG]]: + """ + Finds all AccessNodes that refer to the given `data_name` and are located inside + the scope of the specified MapEntry. + + Returns: + A list of tuples, each consisting of: + - the matching AccessNode, + - the SDFGState in which it resides, + - and the parent SDFG containing the node. + """ + starting_sdfg = self._node_to_sdfg_cache[map_entry] + matching_access_nodes = [] + + for node, parent_state in starting_sdfg.all_nodes_recursive(): + + if (isinstance(node, nodes.AccessNode) and node.data == data_name + and helpers.contained_in(parent_state, node, map_entry)): + + parent_sdfg = self._node_to_sdfg_cache[node] + matching_access_nodes.append((node, parent_state, parent_sdfg)) + + return matching_access_nodes + + def get_maps_between(self, stop_map_entry: nodes.MapEntry, + node: nodes.Node) -> Tuple[List[nodes.MapEntry], List[nodes.MapExit]]: + """ + Returns all MapEntry/MapExit pairs between `node` and `stop_map_entry`, inclusive. + + Maps are returned from innermost to outermost, starting at the scope of `node` and + ending at `stop_map_entry`. 
Assumes that `node` is (directly or indirectly via a + nestedSDFG) contained within the `stop_map_entry`'s scope. + + Args: + stop_map_entry: The outermost MapEntry to stop at (inclusive). + node: The node from which to begin scope traversal. + + Returns: + A tuple of two lists: + - List of MapEntry nodes (from inner to outer scope), + - List of corresponding MapExit nodes. + """ + stop_state = self._node_to_state_cache[stop_map_entry] + stop_exit = stop_state.exit_node(stop_map_entry) + + entries: List[nodes.MapEntry] = [] + exits: List[nodes.MapExit] = [] + + current_state = self._node_to_state_cache[node] + parent_info = helpers.get_parent_map(current_state, node) + + while True: + if parent_info is None: + raise ValueError("Expected node to be in scope of stop_map_entry, but no parent map was found.") + + entry, state = parent_info + exit_node = state.exit_node(entry) + + entries.append(entry) + exits.append(exit_node) + + if exit_node == stop_exit: + break + + parent_info = helpers.get_parent_map(state, entry) + + return entries, exits + + def get_nearest_access_node(self, access_nodes: List[nodes.AccessNode], node: nodes.Node) -> nodes.AccessNode: + """ + Finds the closest access node (by graph distance) to the given node + within the same state. Direction is ignored. + + Args: + access_nodes: List of candidate AccessNodes to search from. + node: The node from which to start the search. + + Returns: + The closest AccessNode (by number of edges traversed). + + Raises: + RuntimeError: If no access node is conected in the node's state to the node. + """ + state = self._node_to_state_cache[node] + + visited = set() + queue = [node] + while queue: + current = queue.pop(0) + if current in access_nodes: + return current + + visited.add(current) + for neighbor in state.neighbors(current): + if neighbor not in visited: + queue.append(neighbor) + + raise RuntimeError(f"No access node found connected to the given node {node}. ") + + def in_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: + """ + Traces all incoming dataflow paths to the given AccessNode. + Only searches in the same state where the AccessNode is. + + Returns: + A list of edge paths (each a list of edges). + """ + state = self._node_to_state_cache[access_node] + + # Start paths with in-edges to the access node. + initial_paths = [[edge] for edge in state.in_edges(access_node)] + queue = deque(initial_paths) + complete_paths = [] + + while queue: + # Get current path and see whether the starting node has in-edges carrying the access nodes data + current_path = queue.popleft() + first_edge = current_path[0] + current_node = first_edge.src + incoming_edges = [edge for edge in state.in_edges(current_node)] + + # If no incoming edges found, this path is complete + if len(incoming_edges) == 0: + + complete_paths.append(current_path) + continue + + # Otherwise, extend the current path and add it to the queue for further processing + for edge in incoming_edges: + if edge in current_path: + raise ValueError("Unexpected cycle detected") + + extended_path = [edge] + current_path + queue.append(extended_path) + + return complete_paths + + def out_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: + """ + Traces all outgoing dataflow paths to the given AccessNode. + Only searches in the same state where the AccessNode is. + + Returns: + A list of edge paths (each a list of edges). 
+ """ + state: SDFGState = self._node_to_state_cache[access_node] + + initial_paths = [[edge] for edge in state.out_edges(access_node)] + queue = deque(initial_paths) + complete_paths = [] + + while queue: + # Get current path and see whether the last node has out-edges carrying the access nodes data + current_path = queue.popleft() + last_edge = current_path[-1] + current_node = last_edge.dst + outgoing_edges = [edge for edge in state.out_edges(current_node)] + + # If no such edges found, this path is complete + if len(outgoing_edges) == 0: + complete_paths.append(current_path) + continue + + # Otherwise, extend the current path and add it to the queue for further processing + for edge in outgoing_edges: + + if edge in current_path: + raise ValueError("Unexpected cycle detected") + + extended_path = current_path + [edge] + queue.append(extended_path) + + return complete_paths diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py new file mode 100644 index 0000000000..4f73d41ef9 --- /dev/null +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -0,0 +1,355 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import warnings +from typing import Dict, Set, Tuple + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, NestedSDFG, Node +from dace.sdfg.state import LoopRegion +from dace.transformation import helpers, pass_pipeline as ppl, transformation + + +@properties.make_properties +@transformation.explicit_cf_compatible +class DefaultSharedMemorySync(ppl.Pass): + """ + This pass inserts synchronization tasklets that call "__syncthreads()". + This is for GPUs. + + Synchronization is added after GPU_ThreadBlock (TB) MapExits if the TB map + writes to shared memory or after collaborative writes to shared memory (smem). + + Important notes: + - Calling "__syncthreads()" inside a TB map can lead to deadlocks, + for example when only a subset of threads participates (thread divergence). + Therefore, users must **not** write to shared memory inside a Sequential + map or LoopRegion that is nested within a TB map. + + - If shared memory is still written sequentially within a TB map, the missing + intermediate synchronizations may lead to race conditions and incorrect results. + Because deadlocks are worse than race conditions, this pass avoids inserting + synchronization inside TB maps, but it will warn the user about potential risks. + + - When writing to and reading from shared memory within the same TB map, + users must ensure that no synchronization is required, since barriers + are not inserted automatically in this case (again, to avoid deadlocks). + If synchronization is needed, the computation should instead be split + across sequential TB maps. There is no warning for race conditions in this + case for misbehavior. + + - In nested TB maps (e.g., GPU_Device map -> TB map -> TB map ...), + synchronization is only inserted at the outermost TB map's exit, + again to avoid deadlocks. 
+ """ + + def __init__(self): + """Initialize the synchronization pass.""" + # Cache each node's parent state during apply_pass() + self._node_to_parent_state: Dict[Node, SDFGState] = dict() + + def apply_pass(self, sdfg: SDFG, _) -> None: + """ + Insert synchronization barriers (`__syncthreads()`) where needed to ensure + shared memory writes are synchronied for potential subsequent reads. + + This pass performs the following steps: + 1. Collect all ThreadBlock-scheduled MapExits and candidate collaborative + shared-memory writes (AccessNodes). + 2. Analyze ThreadBlock MapExits for synchronization requirements. + 3. Insert synchronization barriers after both MapExits and collaborative + shared-memory writes as needed. + """ + + # 1. Find all GPU_ThreadBlock-scheduled Maps and all collaborative writes to + # GPU shared memory, and cache each node's parent state for convenience. + tb_map_exits: Dict[MapExit, SDFGState] = dict() + collaborative_smem_copies: Dict[AccessNode, SDFGState] = dict() + for node, parent_state in sdfg.all_nodes_recursive(): + self._node_to_parent_state[node] = parent_state + if isinstance(node, MapExit) and node.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + tb_map_exits[node] = parent_state + elif isinstance(node, AccessNode) and self.is_collaborative_smem_write(node, parent_state): + collaborative_smem_copies[node] = parent_state + + + # 2. Identify TB MapExits requiring a synchronization barrier + sync_requiring_exits = self.identify_synchronization_tb_exits(tb_map_exits) + + # 3. Insert synchronization barriers for previous TB MapExits + self.insert_synchronization_after_nodes(sync_requiring_exits) + + # 4. Insert synchronization after collaborative shared memory writes + self.insert_synchronization_after_nodes(collaborative_smem_copies) + + def is_collaborative_smem_write(self, node: AccessNode, state: SDFGState) -> bool: + """ + Determine whether the given AccessNode corresponds to a collaborative + shared-memory (smem) write, i.e., whether it is written cooperatively + by GPU threads at the device level but not within a thread block map. + + Parameters + ---------- + node : AccessNode + The candidate access node. + state : SDFGState + The state in which the node resides. + + Returns + ------- + bool + True if the node is a collaborative smem write, False otherwise. + """ + # 1. node is not stored in shared memory - skip + if node.desc(state).storage != dtypes.StorageType.GPU_Shared: + return False + + # 2. To my knowledge, it is not a collaborative write if the result comes from a ThreadBlock map. + if all(isinstance(pred, MapExit) and pred.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock + for pred in state.predecessors(node)): + return False + + # 3. If all in edges are empty, there is no write - and no sync necessary + if all(edge.data.is_empty() for edge in state.in_edges(node)): + return False + + # 4. It is a collaborative copy if it is within a kernel but not within a GPU_ThreadBlock map + if (not gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_Device]) + or gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_ThreadBlock])): + return False + + return True + + def identify_synchronization_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> Dict[MapExit, SDFGState]: + """ + Identify ThreadBlock exits after which "__syncthreads()" should be called. 
+
+        Parameters
+        ----------
+        tb_map_exits : Dict[MapExit, SDFGState]
+            Mapping from GPU_ThreadBlock-scheduled MapExit nodes to their parent SDFGState.
+
+        Returns
+        -------
+        Dict[MapExit, SDFGState]
+            Subset of `tb_map_exits` where any AccessNode between the entry and exit
+            uses GPU shared memory, indicating a synchronization barrier is needed.
+        """
+        sync_requiring_exits: Dict[MapExit, SDFGState] = {}
+
+        for map_exit, state in tb_map_exits.items():
+
+            # Analyze the map that belongs to this exit
+            map_entry = state.entry_node(map_exit)
+            writes_to_smem, race_cond_danger, has_tb_parent = self.tb_exits_analysis(map_entry, map_exit, state)
+
+            # Skip: if this TB map is nested inside another TB map in the same kernel
+            # (i.e., before reaching the GPU_Device map), synchronization responsibility belongs
+            # to the outermost such TB map in the kernel.
+            if has_tb_parent:
+                continue
+
+            # Warn user: potential race condition detected.
+            elif race_cond_danger and writes_to_smem:
+                warnings.warn(
+                    f"Race condition danger: LoopRegion or Sequential Map inside ThreadBlock map {map_entry} "
+                    "writes to GPU shared memory. No synchronization occurs for intermediate steps, "
+                    "because '__syncthreads()' is only called outside the ThreadBlock map to avoid potential deadlocks. "
+                    "Please consider moving the LoopRegion or Sequential Map outside the ThreadBlock map.")
+                sync_requiring_exits[map_exit] = state
+
+            # TB map writes to shared memory: synchronization is needed
+            elif writes_to_smem:
+                sync_requiring_exits[map_exit] = state
+
+        return sync_requiring_exits
+
+    def tb_exits_analysis(self, map_entry: MapEntry, map_exit: MapExit, state: SDFGState) -> Tuple[bool, bool, bool]:
+        """
+        Analyze a GPU_ThreadBlock-scheduled map to determine:
+        - whether it writes to shared memory,
+        - whether such writes may cause race conditions, and
+        - whether it is nested within another GPU_ThreadBlock map inside the kernel.
+
+        Returns a tuple of three booleans:
+
+        1. `writes_to_shared_memory`:
+           True if the map writes to GPU shared memory. This includes writes
+           directly at the MapExit or within the map scope.
+
+        2. `race_cond_danger`:
+           True if there is a potential race condition due to shared memory writes
+           inside either:
+           - a sequentially scheduled map, or
+           - a loop region.
+           (Note: single-iteration loops/sequential maps are not treated differently;
+           they are still marked as dangerous, even though they cannot cause races.)
+
+        3. `has_parent_tb_map`:
+           True if this ThreadBlock map is nested inside another ThreadBlock map
+           (i.e., there exists another TB map between the enclosing GPU_Device
+           map and the current TB map).
+
+        Parameters
+        ----------
+        map_entry : MapEntry
+            The entry node of the ThreadBlock map.
+        map_exit : MapExit
+            The exit node of the ThreadBlock map.
+        state : SDFGState
+            The parent state containing the map.
+
+        Returns
+        -------
+        Tuple[bool, bool, bool]
+            A tuple:
+            `(writes_to_shared_memory, race_cond_danger, has_parent_tb_map)`
+        """
+        # Initially, the flags are all set to False
+        writes_to_shared_memory = False
+        race_cond_danger = False
+        has_parent_tb_map = False
+
+        # 1. Check if the ThreadBlock (TB) map writes to shared memory
+        for edge in state.out_edges(map_exit):
+            is_smem: bool = (isinstance(edge.dst, AccessNode)
+                             and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared)
+            if is_smem and not edge.data.is_empty():
+                writes_to_shared_memory = True
+                break
+
+        # 2. 
Search between map entry and exit:
+        #    - Detect writes to shared memory (unless already found)
+        #    - Collect nested SDFGs for later analysis
+        nested_sdfgs: Set[NestedSDFG] = set()
+
+        for node in state.all_nodes_between(map_entry, map_exit):
+            if not writes_to_shared_memory and isinstance(node, AccessNode):
+                # Check if this AccessNode writes to shared memory
+                if (node.desc(state).storage == dtypes.StorageType.GPU_Shared
+                        and any(not edge.data.is_empty() for edge in state.in_edges(node))):
+                    writes_to_shared_memory = True
+
+            elif isinstance(node, NestedSDFG):
+                nested_sdfgs.add(node)
+
+        # 3. Recursively analyze nested SDFGs:
+        #    - Detect shared memory writes (only if not already found)
+        #    - Check for potential race conditions in loop regions (only if not already flagged)
+        for nsdfg in nested_sdfgs:
+            subs_sdfg = nsdfg.sdfg
+            if not writes_to_shared_memory:
+                writes_to_shared_memory = self.sdfg_writes_to_smem(subs_sdfg)
+
+            if not race_cond_danger:
+                race_cond_danger = self.writes_to_smem_inside_loopregion(subs_sdfg)
+
+        # 4. Check for race condition danger in sequential maps that use shared memory
+        #    (only if not already flagged)
+        if not race_cond_danger:
+            race_cond_danger = any(
+                inner_scope.map.schedule == dtypes.ScheduleType.Sequential and self.map_writes_to_smem(inner_scope)
+                for _, inner_scope in helpers.get_internal_scopes(state, map_entry))
+
+        # 5. Check if this TB map is nested within another TB map
+        parent = helpers.get_parent_map(state, map_entry)
+
+        while parent:
+            parent_map, parent_state = parent
+            if parent_map.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock:
+                has_parent_tb_map = True
+                break
+            if parent_map.map.schedule == dtypes.ScheduleType.GPU_Device:
+                break
+            parent = helpers.get_parent_map(parent_state, parent_map)
+
+        # 6. Return the results
+        return writes_to_shared_memory, race_cond_danger, has_parent_tb_map
+
+    def writes_to_smem_inside_loopregion(self, sdfg: SDFG) -> bool:
+        """
+        Return True if the SDFG writes to GPU shared memory (smem) inside
+        a LoopRegion. This check is recursive and includes nested SDFGs.
+        """
+        for node in sdfg.nodes():
+            if isinstance(node, LoopRegion):
+                # Traverse all nodes inside the loop region and look for written
+                # shared-memory access nodes
+                for subnode, parent in node.all_nodes_recursive():
+                    if (isinstance(subnode, AccessNode)
+                            and subnode.desc(parent).storage == dtypes.StorageType.GPU_Shared
+                            and any(not edge.data.is_empty() for edge in parent.in_edges(subnode))):
+                        return True
+
+            elif isinstance(node, NestedSDFG):
+                # Recurse into nested SDFGs
+                if self.writes_to_smem_inside_loopregion(node.sdfg):
+                    return True
+
+        return False
+
+    def sdfg_writes_to_smem(self, sdfg: SDFG) -> bool:
+        """
+        Return True if the SDFG writes to GPU shared memory (smem),
+        i.e., contains an AccessNode with GPU_Shared storage that has
+        at least one non-empty incoming edge.
+        """
+        for node, state in sdfg.all_nodes_recursive():
+            if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Shared
+                    and any(not edge.data.is_empty() for edge in state.in_edges(node))):
+                return True
+        return False
+
+    def map_writes_to_smem(self, map_entry: MapEntry) -> bool:
+        """
+        Return True if the map writes to GPU shared memory (smem).
+
+        A map is considered to write to smem if:
+        - Any AccessNode with GPU_Shared storage is written to at the MapExit, or
+        - Such writes occur within the map scope, or
+        - A nested SDFG within the map writes to smem.
+        """
+        state = self._node_to_parent_state[map_entry]
+        map_exit = state.exit_node(map_entry)
+
+        # 1. 
Check if MapExit writes directly to shared memory + for edge in state.out_edges(map_exit): + if (isinstance(edge.dst, AccessNode) and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared + and not edge.data.is_empty()): + return True + + # 2. Inspect nodes inside the map scope + for node in state.all_nodes_between(map_entry, map_exit): + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node))): + return True + + if isinstance(node, NestedSDFG) and self.sdfg_writes_to_smem(node.sdfg): + return True + + # No writes to shared memory found + return False + + def insert_synchronization_after_nodes(self, nodes: Dict[Node, SDFGState]) -> None: + """ + Insert synchronization tasklets (calling `__syncthreads()`) after the given + GPU-related nodes. + + Parameters + ---------- + nodes : Dict[Node, SDFGState] + Mapping from SDFG nodes to their parent states after which a + synchronization tasklet should be inserted. + """ + for node, state in nodes.items(): + + sync_tasklet = state.add_tasklet(name="sync_threads", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP) + + for succ in state.successors(node): + state.add_edge(sync_tasklet, None, succ, None, dace.Memlet()) + + state.add_edge(node, None, sync_tasklet, None, dace.Memlet()) diff --git a/tests/codegen/cuda_mempool_test.py b/tests/codegen/cuda_mempool_test.py index eccd97ee61..128634720c 100644 --- a/tests/codegen/cuda_mempool_test.py +++ b/tests/codegen/cuda_mempool_test.py @@ -144,7 +144,8 @@ def tester(A: CudaArray, B: CudaArray): code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 + assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 or code.count( + 'cudaFreeAsync(pooled, gpu_stream0') == 1 # Test code import cupy as cp @@ -198,7 +199,8 @@ def test_memory_pool_if_states(cnd): sdfg.validate() code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 + assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 or code.count( + f'cudaFreeAsync({tmp}, gpu_stream0') == 1 # Test code import cupy as cp diff --git a/tests/codegen/gpu_memcpy_test.py b/tests/codegen/gpu_memcpy_test.py index c7a3525f95..1cc650ffaa 100644 --- a/tests/codegen/gpu_memcpy_test.py +++ b/tests/codegen/gpu_memcpy_test.py @@ -15,11 +15,14 @@ rng = cp.random.default_rng(42) -def count_node(sdfg: dace.SDFG, node_type): +def count_node(sdfg: dace.SDFG, node_type, ignore_gpustream_nodes=True): nb_nodes = 0 for rsdfg in sdfg.all_sdfgs_recursive(): for state in sdfg.states(): for node in state.nodes(): + if (ignore_gpustream_nodes and isinstance(node, dace_nodes.AccessNode) + and node.desc(state).dtype == dace.dtypes.gpuStream_t): + continue if isinstance(node, node_type): nb_nodes += 1 return nb_nodes diff --git a/tests/codegen/nested_kernel_transient_test.py b/tests/codegen/nested_kernel_transient_test.py index 54488a3aac..d4c3182c16 100644 --- a/tests/codegen/nested_kernel_transient_test.py +++ b/tests/codegen/nested_kernel_transient_test.py @@ -24,7 +24,15 @@ def nested(A: dace.float64[128, 64]): state.add_edge(n, 'A', w, None, dace.Memlet('A')) if persistent: - sdfg.arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = 
sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) @@ -50,7 +58,15 @@ def transient(A: dace.float64[128, 64]): sdfg.apply_gpu_transformations() if persistent: - sdfg.cfg_list[-1].arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) @@ -87,7 +103,15 @@ def transient(A: dace.float64[128, 64]): sdfg.apply_gpu_transformations() if persistent: - sdfg.cfg_list[-1].arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) diff --git a/tests/cuda_block_test.py b/tests/cuda_block_test.py index 8b75376a00..74ee21fd90 100644 --- a/tests/cuda_block_test.py +++ b/tests/cuda_block_test.py @@ -181,6 +181,7 @@ def tester(A: dace.float64[200]): tasklet.location['gpu_block'] = 1 code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + sdfg.compile() assert '>= 2' in code and '<= 8' in code assert ' == 1' in code diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index f8553249ea..2a64cd2255 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -10,7 +10,7 @@ import dace import dace.library -from dace import dtypes +from dace import dtypes, Config from dace.codegen import codeobject, targets, compiler, compiled_sdfg, common @@ -31,9 +31,14 @@ def _cuda_helper(): }} }} """ - program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") - dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") + if Config.get('compiler', 'cuda', 'implementation') == 'experimental': + program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") + dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.experimental_cuda.ExperimentalCUDACodeGen, + "CudaDummy") + else: + program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") + dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") build_folder = dace.Config.get('default_build_folder') BUILD_PATH = 
os.path.join(build_folder, "cuda_helper")
diff --git a/tests/passes/gpu_specialization/gpu_stream_test.py b/tests/passes/gpu_specialization/gpu_stream_test.py
new file mode 100644
index 0000000000..07d1facdf9
--- /dev/null
+++ b/tests/passes/gpu_specialization/gpu_stream_test.py
@@ -0,0 +1,116 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+import pytest
+
+import dace
+from dace.codegen import common
+from dace.transformation.pass_pipeline import Pipeline
+from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
+from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
+from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels
+from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets
+from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets
+from dace.transformation.passes.gpu_specialization.insert_gpu_copy_tasklet import InsertGPUCopyTasklets
+from dace.transformation.passes.gpu_specialization.gpu_stream_topology_simplification import GPUStreamTopologySimplification
+
+gpu_stream_pipeline = Pipeline([
+    NaiveGPUStreamScheduler(),
+    InsertGPUStreamsToSDFGs(),
+    ConnectGPUStreamsToKernels(),
+    ConnectGPUStreamsToTasklets(),
+    InsertGPUStreamSyncTasklets(),
+    InsertGPUCopyTasklets(),
+    GPUStreamTopologySimplification(),
+])
+
+backend = common.get_gpu_backend()
+
+
+@pytest.mark.gpu
+def test_basic():
+    """
+    A simple memory copy program.
+
+    Since the SDFG has a single connected component, exactly one GPU stream is used
+    and must be synchronized at the end of the state. For each synchronized stream,
+    the pipeline introduces a memlet from the synchronization tasklet to a GPU stream
+    AccessNode. Therefore, it is sufficient to verify that there is only one sink node
+    with one incoming edge, verify its dtype, and check for the presence of a preceding
+    synchronization tasklet.
+    """
+
+    @dace.program
+    def simple_copy(A: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global,
+                    B: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global):
+        for i in dace.map[0:128:1] @ dace.dtypes.ScheduleType.GPU_Device:
+            B[i] = A[i]
+
+    sdfg = simple_copy.to_sdfg()
+    gpu_stream_pipeline.apply_pass(sdfg, {})
+
+    state = sdfg.states()[0]
+    sink_nodes = state.sink_nodes()
+    node = sink_nodes[0]
+    assert (
+        len(sink_nodes) == 1 and len(state.in_edges(node)) == 1 and isinstance(node, dace.nodes.AccessNode)
+        and node.desc(state).dtype == dace.dtypes.gpuStream_t
+    ), ("Only one sink node should exist; it must be a GPU stream AccessNode with one incoming edge.")
+
+    assert all(isinstance(pre, dace.nodes.Tasklet) and f"{backend}StreamSynchronize(" in pre.code.as_string
+               for pre in state.predecessors(node)), (
+                   "At the end of each state, every used stream must be synchronized.")
+
+
+@pytest.mark.gpu
+def test_extended():
+    """
+    A program that performs two independent memory copies.
+
+    The input arrays reside in host memory, and `apply_gpu_transformations()` is applied
+    to the program. As a result, the data is first copied to GPU global memory, after
+    which the two copies are executed on the GPU. Since these copies form two
+    independent connected components in the resulting SDFG, the naive GPU stream
+    scheduler assigns them to different GPU streams.
+
+    This test verifies that exactly two GPU streams are used, that both streams are
+    synchronized at the end of the state, and that the corresponding asynchronous
+    memory copy tasklets are correctly associated with their assigned streams.
+    """
+
+    @dace.program
+    def independent_copies(A: dace.uint32[128], B: dace.uint32[128], C: dace.uint32[128], D: dace.uint32[128]):
+        for i in dace.map[0:128:1]:
+            B[i] = A[i]
+        for i in dace.map[0:128:1]:
+            D[i] = C[i]
+
+    sdfg = independent_copies.to_sdfg()
+
+    # Transform such that the program can run on GPU, then apply the GPU stream pipeline
+    sdfg.apply_gpu_transformations()
+    gpu_stream_pipeline.apply_pass(sdfg, {})
+
+    # Test 1: Two GPU streams are used, since the naive stream scheduler assigns one
+    # stream per connected component
+    state = sdfg.states()[0]
+    sink_nodes = state.sink_nodes()
+    node = sink_nodes[0]
+    assert (len(sink_nodes) == 1 and len(state.in_edges(node)) == 2 and isinstance(node, dace.nodes.AccessNode)
+            and node.desc(state).dtype == dace.dtypes.gpuStream_t), (
+                "Only one sink node should exist; it must be a GPU stream AccessNode with "
+                "two incoming edges, since the original graph consisted of two connected components.")
+
+    # Test 2: All used streams are synchronized at the end of the state
+    assert all(isinstance(pre, dace.nodes.Tasklet) and f"{backend}StreamSynchronize(" in pre.code.as_string
+               for pre in state.predecessors(node)), (
+                   "At the end of each state, every used stream must be synchronized.")
+
+    # Test 3: Check that we have memory copy tasklets (as we perform two "Main Memory -> GPU Global"
+    # memory copies and two "GPU Global -> Main Memory" memory copies by applying the GPU transformation)
+    # and that they reference the GPU stream input connector in the copy call
+    memcopy_tasklets = [
+        n for n in state.nodes() if isinstance(n, dace.nodes.Tasklet) and f"{backend}MemcpyAsync(" in n.code.as_string
+    ]
+    for tasklet in memcopy_tasklets:
+        assert len(tasklet.in_connectors) == 1, ("Memcpy tasklets must have exactly one input connector "
+                                                 "corresponding to the GPU stream.")
+
+        in_connector = next(iter(tasklet.in_connectors))
+
+        assert in_connector in tasklet.code.as_string, (
+            "Memcpy tasklets must reference their GPU stream input connector in the memcpy call.")
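Reviewer note (illustrative sketch, not part of the patch): a minimal example of how the new
DefaultSharedMemorySync pass from dace/transformation/passes/shared_memory_synchronization.py
could be exercised standalone, and how the 'compiler.cuda.implementation' configuration key
checked throughout this diff could be toggled for code generation. The program name and the
printed check below are assumptions for illustration only; the APIs used are either introduced
in this diff or standard DaCe (dace.program, apply_gpu_transformations, all_nodes_recursive,
dace.config.set_temporary, generate_code).

    # Sketch only: run the shared-memory synchronization pass on a trivial GPU program.
    import dace
    from dace.config import set_temporary
    from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync


    @dace.program
    def axpy(A: dace.float64[256], B: dace.float64[256]):
        for i in dace.map[0:256]:
            B[i] = 2.0 * A[i] + B[i]


    if __name__ == '__main__':
        sdfg = axpy.to_sdfg()
        sdfg.apply_gpu_transformations()

        # The pass only inserts __syncthreads() tasklets after thread-block scopes or
        # collaborative writes that touch GPU_Shared storage, so this plain map is
        # expected to remain unchanged (zero inserted tasklets).
        DefaultSharedMemorySync().apply_pass(sdfg, {})
        syncs = [n for n, _ in sdfg.all_nodes_recursive()
                 if isinstance(n, dace.nodes.Tasklet) and '__syncthreads' in n.code.as_string]
        print(f'inserted sync tasklets: {len(syncs)}')

        # Generate code with the experimental backend selected, scoped via set_temporary;
        # 'compiler.cuda.implementation' is the configuration key this patch introduces and checks.
        with set_temporary('compiler', 'cuda', 'implementation', value='experimental'):
            code = sdfg.generate_code()[0].clean_code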