Changes from all commits (33 commits)
f519b26
Extend cases supported by the explicit copy transformations
ThrudPrimrose Jan 6, 2026
e483a25
Refactor, use views+
ThrudPrimrose Jan 6, 2026
4d7156f
Refactor
ThrudPrimrose Jan 6, 2026
5b3adae
Prep
ThrudPrimrose Jan 6, 2026
09e29e6
Refactor
ThrudPrimrose Jan 6, 2026
14de39e
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 6, 2026
95fc7ec
Add things
ThrudPrimrose Jan 6, 2026
d31158e
Add
ThrudPrimrose Jan 7, 2026
b94dcb0
Add tests
ThrudPrimrose Jan 7, 2026
8a754c3
Extensions
ThrudPrimrose Jan 7, 2026
dad01d3
Fix bug
ThrudPrimrose Jan 7, 2026
551ffaa
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 7, 2026
db71ff2
Fix
ThrudPrimrose Jan 7, 2026
e38016c
Fix
ThrudPrimrose Jan 7, 2026
831724d
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 7, 2026
944db27
Check for GPU outputs in current stream generation
ThrudPrimrose Jan 7, 2026
c77ea55
Fix cpp codegen
ThrudPrimrose Jan 7, 2026
4b52240
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 7, 2026
e29ca86
Refactor
ThrudPrimrose Jan 7, 2026
425d652
refactor
ThrudPrimrose Jan 9, 2026
0ed6406
Precommit
ThrudPrimrose Jan 9, 2026
70141b9
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 9, 2026
bdf51a2
Fix dace current stream name conflict for old codegen compat
ThrudPrimrose Jan 9, 2026
1dac422
Merge branch 'main' into explicit-gpu-global-copies
ThrudPrimrose Jan 11, 2026
faf8ed0
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 11, 2026
f762d5f
Merge branch 'main' into explicit-gpu-global-copies
ThrudPrimrose Jan 23, 2026
925a3e5
Merge
ThrudPrimrose Jan 23, 2026
02b4182
Merge branch 'main' into explicit-gpu-global-copies
ThrudPrimrose Jan 23, 2026
5ae9ca0
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 23, 2026
c88c993
Fix to gpu stream dtype
ThrudPrimrose Jan 24, 2026
bca117c
Add dtype
ThrudPrimrose Jan 24, 2026
9dd1605
Rm gpu def
ThrudPrimrose Jan 24, 2026
a3ddb2d
Rm gpu def
ThrudPrimrose Jan 24, 2026
3 changes: 2 additions & 1 deletion dace/codegen/dispatcher.py
@@ -98,7 +98,8 @@ def add(self, name: str, dtype: DefinedType, ctype: str, ancestor: int = 0, allo
for _, scope, can_access_parent in reversed(self._scopes):
if name in scope:
err_str = "Shadowing variable {} from type {} to {}".format(name, scope[name], dtype)
if (allow_shadowing or config.Config.get_bool("compiler", "allow_shadowing")):
if (allow_shadowing or config.Config.get_bool("compiler", "allow_shadowing")
or dtype == DefinedType.GPUStream):
if not allow_shadowing:
print("WARNING: " + err_str)
else:
16 changes: 10 additions & 6 deletions dace/codegen/targets/cpp.py
Original file line number Diff line number Diff line change
@@ -218,14 +218,17 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher',

def is_cuda_codegen_in_device(framecode) -> bool:
"""
Check the state of the CUDA code generator, whether it is inside device code.
Check whether the (Experimental) CUDA code generator is currently inside device code.
"""
from dace.codegen.targets.cuda import CUDACodeGen

cudaClass = CUDACodeGen

if framecode is None:
cuda_codegen_in_device = False
else:
for codegen in framecode.targets:
if isinstance(codegen, CUDACodeGen):
if isinstance(codegen, cudaClass):
cuda_codegen_in_device = codegen._in_device_code
break
else:
@@ -248,11 +251,9 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode: 'DaCeCodeGener
root = name.split('.')[0]
if root in sdfg.arrays and isinstance(sdfg.arrays[root], data.Structure):
name = name.replace('.', '->')

# Special case: If memory is persistent and defined in this SDFG, add state
# struct to name
if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)):

if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays
return f'__{sdfg.cfg_id}_{name}'
elif not is_cuda_codegen_in_device(framecode): # GPU kernels cannot access state
@@ -807,9 +808,12 @@ def unparse_cr(sdfg, wcr_ast, dtype)
def connected_to_gpu_memory(node: nodes.Node, state: SDFGState, sdfg: SDFG):
for e in state.all_edges(node):
path = state.memlet_path(e)
if ((isinstance(path[0].src, nodes.AccessNode)
and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)):
if (((isinstance(path[0].src, nodes.AccessNode)
and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global))
or ((isinstance(path[-1].dst, nodes.AccessNode)
and path[-1].dst.desc(sdfg).storage is dtypes.StorageType.GPU_Global))):
return True

return False


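The extended connected_to_gpu_memory check now also fires when the destination of a memlet path is in GPU global memory, not only the source. A minimal sketch of the new behavior, assuming the function location shown above (array names are illustrative):

```python
import dace
from dace.codegen.targets.cpp import connected_to_gpu_memory

sdfg = dace.SDFG('h2d_copy')
sdfg.add_array('hA', [32], dace.float64)  # default (host) storage
sdfg.add_array('gA', [32], dace.float64, storage=dace.StorageType.GPU_Global)

state = sdfg.add_state()
host = state.add_access('hA')
dev = state.add_access('gA')
state.add_nedge(host, dev, dace.Memlet('hA[0:32]'))  # host-to-device copy

# Previously only the path source was checked; the host-side node is now also
# recognized as connected to GPU global memory through its destination.
assert connected_to_gpu_memory(host, state, sdfg)
```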
28 changes: 23 additions & 5 deletions dace/codegen/targets/cpu.py
@@ -502,6 +502,14 @@ def allocate_array(self,

return
elif (nodedesc.storage == dtypes.StorageType.Register):
# This assignment is necessary to unify the explicitly inserted GPU streams with the
# streams declared through the state struct of the SDFG.
if nodedesc.dtype == dtypes.gpuStream_t:
ctype = dtypes.gpuStream_t.ctype
allocation_stream.write(f"{ctype}* {name} = __state->gpu_context->streams;")
define_var(name, DefinedType.Pointer, ctype)
return

ctypedef = dtypes.pointer(nodedesc.dtype).ctype
if nodedesc.start_offset != 0:
raise NotImplementedError('Start offset unsupported for registers')
@@ -577,6 +585,9 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap

if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)):
return
elif nodedesc.dtype == dtypes.gpuStream_t:
callsite_stream.write(f"{alloc_name} = nullptr;")
return
elif (nodedesc.storage == dtypes.StorageType.CPU_Heap
or (nodedesc.storage == dtypes.StorageType.Register and
(symbolic.issymbolic(arrsize, sdfg.constants) or
@@ -994,6 +1005,11 @@ def process_out_memlets(self,
dst_edge = dfg.memlet_path(edge)[-1]
dst_node = dst_edge.dst

if isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state).dtype == dtypes.gpuStream_t:
# Special case: GPU streams do not represent data flow - they assign GPU streams to kernels/tasks.
# Thus, nothing needs to be written, and out memlets of this kind should be ignored.
continue

# Target is neither a data nor a tasklet node
if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode)
and not isinstance(dst_node, nodes.CodeNode)):
@@ -1035,8 +1051,7 @@ def process_out_memlets(self,
# Tasklet -> array with a memlet. Writing to array is emitted only if the memlet is not empty
if isinstance(node, nodes.CodeNode) and not edge.data.is_empty():
if not uconn:
raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format(
str(edge.src), str(edge.dst)))
return

conntype = node.out_connectors[uconn]
is_scalar = not isinstance(conntype, dtypes.pointer)
@@ -1254,7 +1269,6 @@ def memlet_definition(self,
# Dynamic WCR memlets start uninitialized
result += "{} {};".format(memlet_type, local_name)
defined = DefinedType.Scalar

else:
if not memlet.dynamic:
if is_scalar:
@@ -1289,8 +1303,12 @@ def memlet_definition(self,
memlet_type = ctypedef
result += "{} &{} = {};".format(memlet_type, local_name, expr)
defined = DefinedType.Stream
else:
raise TypeError("Unknown variable type: {}".format(var_type))

# Set the defined type for GPU stream connectors.
# Shadowing of the stream variable needs to be allowed.
if memlet_type == 'gpuStream_t':
var_type = DefinedType.GPUStream
defined = DefinedType.GPUStream

if defined is not None:
self._dispatcher.defined_vars.add(local_name, defined, memlet_type, allow_shadowing=allow_shadowing)
4 changes: 2 additions & 2 deletions dace/config_schema.yml
@@ -268,7 +268,7 @@ required:
type: str
title: Arguments
description: Compiler argument flags
default: '-fPIC -Wall -Wextra -O3 -march=native -ffast-math -Wno-unused-parameter -Wno-unused-label'
default: '-fopenmp -fPIC -Wall -Wextra -O3 -march=native -ffast-math -Wno-unused-parameter -Wno-unused-label'
default_Windows: '/O2 /fp:fast /arch:AVX2 /D_USRDLL /D_WINDLL /D__restrict__=__restrict'

libs:
@@ -326,7 +326,7 @@ required:
Additional CUDA architectures (separated by commas)
to compile GPU code for, excluding the current
architecture on the compiling machine.
default: '60'
default: '86'

hip_arch:
type: str
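Both defaults remain overridable through the configuration API; for example, a user targeting an older card can reset cuda_arch at runtime (a minimal sketch; the key path follows the schema above):

```python
from dace.config import Config

# Override the new sm_86 default for additional CUDA architectures,
# e.g. when compiling for an sm_70 (V100) machine.
Config.set('compiler', 'cuda', 'cuda_arch', value='70')
print(Config.get('compiler', 'cuda', 'cuda_arch'))
```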
18 changes: 16 additions & 2 deletions dace/dtypes.py
@@ -71,6 +71,7 @@ class ScheduleType(AutoNumberEnum):
GPU_ThreadBlock = () #: Thread-block code
GPU_ThreadBlock_Dynamic = () #: Allows rescheduling work within a block
GPU_Persistent = ()
GPU_Warp = ()

Snitch = ()
Snitch_Multicore = ()
@@ -84,6 +85,11 @@ class ScheduleType(AutoNumberEnum):
ScheduleType.GPU_Persistent,
]

# A subset of GPU schedule types for ExperimentalCUDACodeGen
EXPERIMENTAL_GPU_SCHEDULES = [
ScheduleType.GPU_Warp,
]

# A subset of CPU schedule types
CPU_SCHEDULES = [
ScheduleType.CPU_Multicore,
@@ -95,6 +101,8 @@ class ScheduleType(AutoNumberEnum):
StorageType.GPU_Shared,
]

GPU_KERNEL_ACCESSIBLE_STORAGES = [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.Register]


@undefined_safe_enum
class ReductionType(AutoNumberEnum):
@@ -192,7 +200,8 @@ class TilingType(AutoNumberEnum):
ScheduleType.GPU_ThreadBlock: StorageType.Register,
ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register,
ScheduleType.SVE_Map: StorageType.CPU_Heap,
ScheduleType.Snitch: StorageType.Snitch_TCDM
ScheduleType.Snitch: StorageType.Snitch_TCDM,
ScheduleType.GPU_Warp: StorageType.Register,
}

# Maps from ScheduleType to default ScheduleType for sub-scopes
@@ -207,9 +216,10 @@ class TilingType(AutoNumberEnum):
ScheduleType.GPU_Device: ScheduleType.GPU_ThreadBlock,
ScheduleType.GPU_ThreadBlock: ScheduleType.Sequential,
ScheduleType.GPU_ThreadBlock_Dynamic: ScheduleType.Sequential,
ScheduleType.GPU_Warp: ScheduleType.Sequential,
ScheduleType.SVE_Map: ScheduleType.Sequential,
ScheduleType.Snitch: ScheduleType.Snitch,
ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore
ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore,
}

# Maps from StorageType to a preferred ScheduleType for helping determine schedules.
@@ -1240,6 +1250,7 @@ class string(_DaCeArray, npt.NDArray[numpy.str_]): ...
class vector(_DaCeArray, npt.NDArray[numpy.void]): ...
class MPI_Request(_DaCeArray, npt.NDArray[numpy.void]): ...
class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ...
class gpuStream_t(_DaCeArray, npt.NDArray[numpy.void]): ...
# yapf: enable
else:
# Runtime definitions
@@ -1260,6 +1271,7 @@ class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ...
complex128 = typeclass(numpy.complex128)
string = stringtype()
MPI_Request = opaque('MPI_Request')
gpuStream_t = opaque('gpuStream_t')
float32sr = Float32sr()


@@ -1281,6 +1293,7 @@ class Typeclasses(AutoNumberEnum):
float64 = float64
complex64 = complex64
complex128 = complex128
gpuStream_t = gpuStream_t


_bool = bool
@@ -1508,6 +1521,7 @@ def can_access(schedule: ScheduleType, storage: StorageType):
ScheduleType.GPU_Persistent,
ScheduleType.GPU_ThreadBlock,
ScheduleType.GPU_ThreadBlock_Dynamic,
ScheduleType.GPU_Warp,
]:
return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned]
elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]:
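A small sketch of how the new schedule type and opaque stream dtype behave (the array name is illustrative):

```python
import dace
from dace import dtypes

# GPU_Warp is treated like the other GPU schedules for storage accessibility.
assert dtypes.can_access(dtypes.ScheduleType.GPU_Warp, dtypes.StorageType.GPU_Shared)
assert not dtypes.can_access(dtypes.ScheduleType.GPU_Warp, dtypes.StorageType.CPU_Heap)

# gpuStream_t is an opaque typeclass and can be used as the dtype of a
# transient stream array, as the codegen changes in this PR expect.
sdfg = dace.SDFG('stream_dtype_example')
sdfg.add_array('gpu_streams', [4], dtypes.gpuStream_t,
               storage=dace.StorageType.Register, transient=True)
print(sdfg.arrays['gpu_streams'].dtype)
```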
7 changes: 7 additions & 0 deletions dace/sdfg/state.py
@@ -406,6 +406,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto
if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()):
return result

# With the explicit (new) GPU stream handling we can have dynamic out connectors, e.g.,
# KernelExit: stream -> None: AccessNode, where the AccessNode accesses a stream array.
# Memlets are used here, but they do not describe how data flows.
if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device
and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t):
return result

# Prepend incoming edges until reaching the source node
curedge = edge
visited = set()
11 changes: 8 additions & 3 deletions dace/sdfg/validation.py
@@ -858,9 +858,14 @@ def validate_state(state: 'dace.sdfg.SDFGState',
for oe in state.out_edges(dst_node)}):
pass
else:
raise InvalidSDFGEdgeError(
f"Memlet creates an invalid path (sink node {dst_node}"
" should be a data node)", sdfg, state_id, eid)
if isinstance(dst_node, nd.Tasklet) and len(dst_node.in_connectors) == 0 and len(
dst_node.out_connectors) == 0:
# Tasklets with no input or output connector -> sync tasklet -> OK
pass
else:
raise InvalidSDFGEdgeError(
f"Memlet creates an invalid path (sink node {dst_node}"
" should be a data node)", sdfg, state_id, eid)
# If scope(dst) is disjoint from scope(src), it's an illegal memlet
else:
raise InvalidSDFGEdgeError("Illegal memlet between disjoint scopes", sdfg, state_id, eid)
30 changes: 30 additions & 0 deletions dace/transformation/helpers.py
@@ -1959,3 +1959,33 @@ def _is_pointer(obj) -> bool:
def _is_structure_view(obj) -> bool:
"""Check if object is a StructureView."""
return isinstance(obj, data.StructureView)


def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool:
"""
Checks if the given node is enclosed within a Map whose schedule type
matches any in the `schedules` set.
Parameters
----------
state : SDFGState
The State where the node resides
node : nodes.Node
The node to check.
schedules : set[dtypes.ScheduleType]
A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}).
Returns
----------
bool
True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise.
"""
current = node

while current is not None:
if isinstance(current, nodes.MapEntry):
if current.map.schedule in schedules:
return True

parent = get_parent_map(state, current)
if parent is None:
return False
current, state = parent
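A minimal usage sketch of the new helper (the SDFG construction is illustrative):

```python
import dace
from dace import dtypes
from dace.transformation.helpers import is_within_schedule_types

sdfg = dace.SDFG('enclosure_check')
state = sdfg.add_state()

# A GPU kernel map enclosing an (empty) tasklet
kernel_entry, kernel_exit = state.add_map('kernel', dict(i='0:32'),
                                          schedule=dtypes.ScheduleType.GPU_Device)
tasklet = state.add_tasklet('work', {}, {}, '')
state.add_nedge(kernel_entry, tasklet, dace.Memlet())
state.add_nedge(tasklet, kernel_exit, dace.Memlet())

assert is_within_schedule_types(state, tasklet, {dtypes.ScheduleType.GPU_Device})
assert not is_within_schedule_types(state, tasklet, {dtypes.ScheduleType.CPU_Multicore})
```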
1 change: 1 addition & 0 deletions dace/transformation/passes/gpu_specialization/__init__.py
@@ -0,0 +1 @@
# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
@@ -0,0 +1,69 @@
# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.codegen import common
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams import InsertGPUStreams, get_gpu_stream_array_name, get_gpu_stream_connector_name


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToKernels(ppl.Pass):
"""
This pass attaches GPU streams to kernels (i.e., maps scheduled with dtypes.ScheduleType.GPU_Device).

It adds GPU stream AccessNodes and connects them to kernel entry and exit nodes,
indicating which GPU stream each kernel is assigned to. These assignments are used,
for example, when launching the kernels.
"""

def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
return {NaiveGPUStreamScheduler, InsertGPUStreams}

def modifies(self) -> ppl.Modifies:
return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

def should_reapply(self, modified: ppl.Modifies) -> bool:
return False

def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
# Retrieve the GPU stream array name and the prefix for individual stream variables
stream_array_name = get_gpu_stream_array_name()
stream_var_name_prefix = get_gpu_stream_connector_name()

# Retrieve GPU stream assignments for nodes
stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

# Link kernels to their assigned GPU streams
for sub_sdfg in sdfg.all_sdfgs_recursive():

for state in sub_sdfg.states():
for node in state.nodes():

# Not a kernel entry - continue
if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device):
continue

# Stream connector name and the GPU stream assigned to the kernel
assigned_gpustream = stream_assignments[node]
gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}"
accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

# Assign the GPU stream to the kernel entry
kernel_entry = node
kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t)
stream_array_in = state.add_access(stream_array_name)
state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name,
dace.Memlet(accessed_gpu_stream))

# Assign the GPU stream to the kernel exit
kernel_exit = state.exit_node(kernel_entry)
stream_array_out = state.add_access(stream_array_name)
state.add_edge(kernel_exit, None, stream_array_out, None, dace.Memlet(None))

return {}
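A sketch of how this pass composes with its declared dependencies in a pipeline. The module path of ConnectGPUStreamsToKernels is inferred from this diff (the new file name is not shown in this view), and the SDFG is a placeholder:

```python
import dace
from dace.transformation import pass_pipeline as ppl
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams import InsertGPUStreams
# Assumed module name for the pass defined above:
from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels

# A minimal SDFG with one GPU kernel (illustrative)
sdfg = dace.SDFG('kernel_streams')
sdfg.add_array('A', [64], dace.float64, storage=dace.StorageType.GPU_Global)
state = sdfg.add_state()
state.add_mapped_tasklet('kernel', dict(i='0:64'),
                         dict(inp=dace.Memlet('A[i]')), 'out = inp + 1',
                         dict(out=dace.Memlet('A[i]')),
                         schedule=dace.ScheduleType.GPU_Device, external_edges=True)

# Run stream scheduling, stream insertion, and kernel attachment in dependency order
pipeline = ppl.Pipeline([NaiveGPUStreamScheduler(), InsertGPUStreams(), ConnectGPUStreamsToKernels()])
pipeline.apply_pass(sdfg, {})
```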