553 changes: 553 additions & 0 deletions dace/codegen/targets/gpu_helpers/copy_strategies.py

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions dace/codegen/targets/gpu_helpers/gpu_utils.py
@@ -0,0 +1,27 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from dace import Config
from dace.codegen import common


def generate_sync_debug_call() -> str:
"""
Generate backend synchronization and error-check calls as a string if
synchronous debugging is enabled. The backend API prefix (e.g., 'cuda')
is determined via ``common.get_gpu_backend()``.

Returns
-------
str
The generated debug call code, or an empty string if debugging is disabled.
"""
backend: str = common.get_gpu_backend()
sync_call: str = ""
if Config.get_bool('compiler', 'cuda', 'syncdebug'):
sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n"
f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n")

return sync_call
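For illustration, with the CUDA backend and the `compiler.cuda.syncdebug` option enabled, the returned string consists of the following two lines (a sketch of the expected output):

    DACE_GPU_CHECK(cudaGetLastError());
    DACE_GPU_CHECK(cudaDeviceSynchronize());

With `syncdebug` disabled, an empty string is returned, so call sites can append the result unconditionally.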
11 changes: 11 additions & 0 deletions dace/config_schema.yml
@@ -465,6 +465,17 @@ required:
will raise an exception if such a Memlet is encountered. This allows the user
to have full control over all Maps in the SDFG.

# New configs, needed for new CUDACodeGen
gpu_stream_name:
type: str
title: Name for the GPU stream object
description: >
GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously
and in parallel. This field specifies the naming convention for the GPU stream array and its connectors
in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the
stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a
connector for gpu_streams[0].
default: gpu_streams,gpu_stream
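As an illustrative sketch (not part of the schema itself), the stream-management passes below split this value into the stream-array name and the connector-name prefix:

    from dace.config import Config

    # 'gpu_streams,gpu_stream' -> array name and connector-name prefix
    stream_array_name, stream_var_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')
    connector_name = f"{stream_var_prefix}0"     # 'gpu_stream0'
    accessed_stream = f"{stream_array_name}[0]"  # 'gpu_streams[0]'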

#############################################
# General FPGA flags
14 changes: 14 additions & 0 deletions dace/dtypes.py
@@ -87,6 +87,18 @@ class ScheduleType(aenum.AutoNumberEnum):
ScheduleType.GPU_Persistent,
]

# A subset of GPU schedule types for the new GPU backend
GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN = [
ScheduleType.GPU_Device,
ScheduleType.GPU_ThreadBlock,
]

# A subset of on-GPU storage types for the new GPU backend
GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN = [
StorageType.GPU_Global,
StorageType.GPU_Shared,
]

# A subset of CPU schedule types
CPU_SCHEDULES = [
ScheduleType.CPU_Multicore,
@@ -1266,6 +1278,7 @@ def isconstant(var):
complex128 = typeclass(numpy.complex128)
string = stringtype()
MPI_Request = opaque('MPI_Request')
gpuStream_t = opaque('gpuStream_t')


@undefined_safe_enum
@@ -1286,6 +1299,7 @@ class Typeclasses(aenum.AutoNumberEnum):
float64 = float64
complex64 = complex64
complex128 = complex128
gpuStream_t = gpuStream_t


_bool = bool
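The opaque `gpuStream_t` type added above is what the stream-management passes in this PR use for the GPU stream array and its connectors. A minimal sketch of declaring such a transient array on an SDFG (assuming the standard `add_array` API; the actual InsertGPUStreamsToSDFGs pass may set this up differently):

    import dace
    from dace import dtypes

    sdfg = dace.SDFG('example')
    # A transient array of opaque GPU stream handles, one entry per concurrent stream
    sdfg.add_array('gpu_streams', [4], dtypes.gpuStream_t, transient=True)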
7 changes: 7 additions & 0 deletions dace/sdfg/state.py
@@ -405,6 +405,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto
if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()):
return result

# For GPU stream (i.e., cudaStream/hipStream) management, dynamic out connectors may occur, e.g.
# a (GPU_Device-scheduled) MapExit: stream -> None: AccessNode, where the AccessNode accesses a stream array.
# Such memlets express stream assignment rather than data flow, so no data path is traced for them.
if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device
        and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(self).dtype == dtypes.gpuStream_t):
return result

# Prepend incoming edges until reaching the source node
curedge = edge
visited = set()
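For reference, the edge shape that this early return targets is the one produced by the stream-connection passes later in this PR, roughly as follows (a sketch using the default names from the configuration above):

    # kernel_exit: MapExit of a GPU_Device-scheduled map with out-connector 'gpu_stream0'
    # streams:     AccessNode over the 'gpu_streams' array, whose dtype is gpuStream_t
    state.add_edge(kernel_exit, 'gpu_stream0', streams, None, dace.Memlet('gpu_streams[0]'))
    # memlet_path() stops at such an edge instead of tracing it as a data path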
32 changes: 32 additions & 0 deletions dace/transformation/helpers.py
@@ -1552,6 +1552,38 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio
return None


def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool:
"""
Checks if the given node is enclosed within a Map whose schedule type
matches any in the `schedules` set.

Parameters
----------
state : SDFGState
The state in which the node resides.
node : nodes.Node
The node to check.
schedules : set[dtypes.ScheduleType]
A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}).

Returns
-------
bool
True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise.
"""
current = node

while current is not None:
if isinstance(current, nodes.MapEntry):
if current.map.schedule in schedules:
return True

parent = get_parent_map(state, current)
if parent is None:
return False
current, state = parent

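A brief usage sketch (the `state` and `node` variables are hypothetical), checking whether a node is nested inside a GPU kernel map:

    if is_within_schedule_types(state, node, {dtypes.ScheduleType.GPU_Device}):
        # The node is enclosed (directly or transitively) by a GPU_Device-scheduled map
        ...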

def redirect_edge(state: SDFGState,
edge: graph.MultiConnectorEdge[Memlet],
new_src: Optional[nodes.Node] = None,
@@ -0,0 +1,70 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.codegen import common
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToKernels(ppl.Pass):
"""
This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps).

Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes,
indicating which GPU stream each kernel is assigned to. These assignments are used,
for example, when launching the kernels.
"""

def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs}

def modifies(self) -> ppl.Modifies:
return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

def should_reapply(self, modified: ppl.Modifies) -> bool:
return False

def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
# Retrieve the GPU stream array name and the prefix for individual stream variables
stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')

# Retrieve GPU stream assignments for nodes
stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

# Link kernels to their assigned GPU streams
for sub_sdfg in sdfg.all_sdfgs_recursive():

for state in sub_sdfg.states():
for node in state.nodes():

# Not a kernel entry - continue
if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device):
continue

# Connector name and the GPU stream assigned to the kernel
assigned_gpustream = stream_assignments[node]
gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}"
accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

# Assign the GPU stream to the kernel entry
kernel_entry = node
kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t)
stream_array_in = state.add_access(stream_array_name)
state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name,
dace.Memlet(accessed_gpu_stream))

# Assign the GPU stream to the kernel exit
kernel_exit = state.exit_node(kernel_entry)
kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t)
stream_array_out = state.add_access(stream_array_name)
state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None,
dace.Memlet(accessed_gpu_stream))

return {}
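A sketch of applying the pass together with its dependencies through the standard pass pipeline, given an SDFG `sdfg` that has already been prepared for the GPU (assuming the usual `Pipeline` API and that the listed passes need no constructor arguments):

    from dace.transformation.pass_pipeline import Pipeline

    pipeline = Pipeline([NaiveGPUStreamScheduler(), InsertGPUStreamsToSDFGs(), ConnectGPUStreamsToKernels()])
    results = {}
    pipeline.apply_pass(sdfg, results)
    # results['NaiveGPUStreamScheduler'] holds the node -> GPU stream assignments read above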
@@ -0,0 +1,80 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels

# Placeholder for the GPU stream variable used in tasklet code
STREAM_PLACEHOLDER = "__dace_current_stream"


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToTasklets(ppl.Pass):
"""
This pass ensures that tasklets which require access to their assigned GPU stream
are provided with it explicitly.

Such tasklets typically originate from expanded LibraryNodes targeting GPUs.
These nodes may reference the special placeholder variable `__dace_current_stream`,
which is expected to be defined during unparsing in `cpp.py`.

To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use
the GPU stream AccessNode directly.

Note that this pass is similar to `ConnectGPUStreamsToKernels`.
"""

def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels}

def modifies(self) -> ppl.Modifies:
return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

def should_reapply(self, modified: ppl.Modifies) -> bool:
return False

def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
# Retrieve the GPU stream's array name
stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0]

# Retrieve GPU stream assignments for nodes
stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

# Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code
# and provide them with the needed GPU stream explicitly
for sub_sdfg in sdfg.all_sdfgs_recursive():

for state in sub_sdfg.states():
for node in state.nodes():

# Not a tasklet - continue
if not isinstance(node, nodes.Tasklet):
continue

# Tasklet does not use its assigned GPU stream - continue
if STREAM_PLACEHOLDER not in node.code.as_string:
continue

# Connector name and the GPU stream assigned to the tasklet
assigned_gpustream = stream_assignments[node]
gpu_stream_conn = STREAM_PLACEHOLDER
accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

# Provide the GPU stream explicitly to the tasklet
stream_array_in = state.add_access(stream_array_name)
stream_array_out = state.add_access(stream_array_name)

node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t)
node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True)

state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream))
state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream))

return {}
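For illustration, a tasklet produced by a GPU library-node expansion might contain code such as the following (hypothetical snippet, shown only to illustrate the placeholder):

    cublasSetStream(handle, __dace_current_stream);

After this pass runs, the tasklet carries a `__dace_current_stream` in/out connector of type `gpuStream_t`, fed from `gpu_streams[<assigned stream>]` by the added edges, so the placeholder resolves to an explicit value in the SDFG rather than a name injected during unparsing in `cpp.py`.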