Draft
Changes from all commits
108 commits
e327b1e
Start of ExperimentalCUDACodeGen impementation
aydogdub May 8, 2025
e1d75d8
clean-up my workspace
aydogdub May 8, 2025
7a9a6f7
initial complete implementation of GPU_Warp schedule
aydogdub May 12, 2025
9d81a37
small test of GPU_Warp schedule
aydogdub May 12, 2025
f47aba6
fix ThreadBlock scope generation and improve readability
aydogdub May 13, 2025
2d4a8f3
setting up testing environment
aydogdub May 16, 2025
665b217
fix issues and enable launch bounds hint
aydogdub May 16, 2025
797df83
refactoring computation of kernel dimensions
aydogdub May 20, 2025
c575a32
report 1
aydogdub May 23, 2025
0270047
clean-up and finishing small TODO's
aydogdub May 25, 2025
9be949f
Some TODO's and improving _generate_kernel_scope function
aydogdub May 26, 2025
c1ec70a
new configs
aydogdub May 26, 2025
8f2a5da
Refactoring: Getting rid of complicated cuda stream handling and repl…
aydogdub Jun 1, 2025
52a6394
test for out of kernel memory copies
aydogdub Jun 2, 2025
6ceb4e8
provisional memory copy solution
aydogdub Jun 2, 2025
52a89c5
A naive alternative of a cuda stream scheduler as a pass
aydogdub Jun 4, 2025
9f26992
adapt such that the naive GPU scheduler pass works and can be used by…
aydogdub Jun 4, 2025
5a0f479
from now on, write important notes here to help future developers and…
aydogdub Jun 4, 2025
af8faf1
Looking at the effects of NaiveGPUStreamScheduler
aydogdub Jun 4, 2025
8b5020a
Major refactoring. Create several files, implement Strategy Pattern f…
aydogdub Jun 5, 2025
5970391
Synchronization Insertion pass, almost done + notebook examples
aydogdub Jun 12, 2025
50f5431
finish open work on DefaultSharedMemorySync Pass (skipping sequential…
aydogdub Jun 13, 2025
2a560bc
scratch notebooks where I visually checked the passes, deleted unnece…
aydogdub Jun 13, 2025
3079690
const arrays utils alpha impl
ThrudPrimrose Jun 13, 2025
54193dd
filling the strategy provisionally, might need change in future
aydogdub Jun 13, 2025
aa50223
Extend synchronization pass and set scratch + configs.
aydogdub Jun 15, 2025
558a815
Fixed mistakes and integrated the synchronization pass to the codegen…
aydogdub Jun 15, 2025
a545ce8
fix collaborative synchronization- not requred anymore
aydogdub Jun 16, 2025
b0c5e07
Yakups examples, bad example of issues of legacy codegen and testing …
aydogdub Jun 16, 2025
409971c
yapkups sdfg examples, stored for inspection and testing for later
aydogdub Jun 22, 2025
88d29a0
New smem tests, copy past old tests to new testing folder, apply fixe…
aydogdub Jun 22, 2025
2332a63
copying yakups validation adaption (and symbolic.py, which has newcod…
aydogdub Jun 25, 2025
df452ba
User can now choose name current thread blocks variable name, which m…
aydogdub Jun 25, 2025
5aad68a
Fixed and refactored GPU stream sync pass
aydogdub Jun 26, 2025
34aa669
Fixed mistake in the copy strategy
aydogdub Jun 27, 2025
d03c5d0
Add validation check to check for intersatte edge assignments to scal…
ThrudPrimrose Jul 3, 2025
e13eade
Merge branch 'main' into const_array_utils
ThrudPrimrose Jul 3, 2025
974573a
Alpha implementation sketch
ThrudPrimrose Jul 3, 2025
8f7a258
Fix scope mistakes- missing brackets. Leads to errors for certain wei…
aydogdub Jul 3, 2025
fc3cf9b
change default architecture, might be more of a personal issue. Yakup…
aydogdub Jul 3, 2025
b4aa787
Fixed Yakups tb pass, reported MapTiling issue (applied workaround) a…
aydogdub Jul 3, 2025
40f0b31
Add pass to detect const
ThrudPrimrose Jul 3, 2025
8114c06
using union instead |
ThrudPrimrose Jul 3, 2025
8490a64
Using Union instead |
ThrudPrimrose Jul 3, 2025
c932fd2
refactor
ThrudPrimrose Jul 3, 2025
a84021e
Fix name clash
ThrudPrimrose Jul 3, 2025
8a5bb99
Add improved validation test for the interstate_edge_utils
ThrudPrimrose Jul 4, 2025
37b604d
Run precommit hook
ThrudPrimrose Jul 4, 2025
1112913
Typefix
ThrudPrimrose Jul 4, 2025
76d0aa1
Merge branch 'main' into const_array_utils
ThrudPrimrose Jul 4, 2025
498e783
Rename
ThrudPrimrose Jul 4, 2025
3355569
allows to switch bettwen the codegens without local definition of ptr…
aydogdub Jul 7, 2025
9f77a03
Missing Explicit ThreadBlock Maps will now be handled by a pass that …
aydogdub Jul 7, 2025
15b9e1c
small change
aydogdub Jul 7, 2025
db2c874
cleaning up
aydogdub Jul 7, 2025
443292a
Update infer_const_args.py
ThrudPrimrose Jul 8, 2025
c0f4633
stuff to make async memcpy work and AddThreadBlockMap for cpu.py
aydogdub Jul 8, 2025
19972df
Merge branch 'main' into newgpucodegen
aydogdub Jul 8, 2025
84b585b
format files to dace style, using pre-commit run --all
aydogdub Jul 8, 2025
b42f094
Merge branch 'main' into const_array_utils
ThrudPrimrose Jul 8, 2025
e102233
Merge remote-tracking branch 'upstream/const_array_utils' into newgpu…
aydogdub Jul 8, 2025
a7ccb31
Improve API
ThrudPrimrose Jul 8, 2025
d86bf47
Merge remote-tracking branch 'upstream/const_array_utils' into newgpu…
aydogdub Jul 9, 2025
2700706
removing workspace folder for PR and from the repository
aydogdub Jul 10, 2025
bd632d6
provisional implementation for constant checks
aydogdub Jul 14, 2025
09caf2c
ensure correct CUDA backend is selected
aydogdub Jul 14, 2025
31fe8f8
Yakups fixes during Meeting
aydogdub Jul 14, 2025
20b4e09
Update map free symbols
ThrudPrimrose Jul 14, 2025
1838a17
Update
ThrudPrimrose Jul 14, 2025
a71b0ff
Ensure no gpu stream synchronization within Kernels occur
aydogdub Jul 15, 2025
1ec735e
Handle GPU_global, in-kernel defined transients used for backwards co…
aydogdub Jul 28, 2025
1b63608
small refactoring
aydogdub Aug 5, 2025
b8f282b
Experimental way to support Stream objects
aydogdub Aug 5, 2025
8206f3e
streams as opaque types
aydogdub Aug 5, 2025
7245b1b
Revert merge and Implement initial support for dynamic inputs
aydogdub Aug 6, 2025
d74e7dd
Merge branch 'newgpucodegen' of https://github.com/aydogdub/dace into…
aydogdub Aug 6, 2025
217d8c2
New approach for GPU streams- make it explicit
aydogdub Aug 15, 2025
9c65a1f
merge and adapt const-checks
aydogdub Aug 15, 2025
e668f29
Merge branch 'newgpucodegen' of https://github.com/aydogdub/dace into…
aydogdub Aug 15, 2025
f416188
Add support for expanded tasklets using GPU streams. Fix small issues
aydogdub Aug 17, 2025
8b2ece1
finish GPU stream management, fix issues to increase test coverage an…
aydogdub Aug 24, 2025
b979533
small refactoring
aydogdub Aug 26, 2025
3a848db
failing
aydogdub Aug 28, 2025
264191e
various fixes and clean ups, especially regarding GPU stream management
aydogdub Sep 10, 2025
01e462a
Fixing GPU stream management and clean up
aydogdub Sep 10, 2025
983d80d
set back to legacy CUDACodeGen
aydogdub Sep 10, 2025
087f08a
Merge remote-tracking branch 'upstream/main' into newgpucodegen
aydogdub Sep 10, 2025
802f24a
fix
aydogdub Sep 10, 2025
85f2cb1
reset to normal cuda.py file
aydogdub Sep 10, 2025
3d76c3e
start of new pipeline
aydogdub Sep 11, 2025
f149b01
fix
aydogdub Sep 15, 2025
e2ad61c
fixes
aydogdub Sep 15, 2025
db07e4c
trying fix
aydogdub Sep 15, 2025
abe3e7b
fix of yakup
aydogdub Sep 15, 2025
cb7f48c
fix
aydogdub Sep 16, 2025
f46bd46
quick
aydogdub Sep 16, 2025
fa85b76
enable spezialization via location
aydogdub Sep 17, 2025
c286558
missed synchronization now added
aydogdub Sep 17, 2025
15fc13b
fix, clean-up and pre-commit
aydogdub Sep 17, 2025
0f84a4a
fix
aydogdub Sep 17, 2025
7bc1226
fix missed case in default shared memory synchornization
aydogdub Sep 24, 2025
0ffbc83
avoid unnecessary smem sync, and add additional case where stream syn…
aydogdub Oct 7, 2025
7a080cb
add GPU stream pipeline passes and necessary helpers
aydogdub Nov 21, 2025
eb10eac
added tests and adhzstments
aydogdub Dec 19, 2025
69fbb73
run pre-commit
aydogdub Dec 19, 2025
b72aa2c
Merge branch 'main' into new-gpu-stream-passes
aydogdub Dec 22, 2025
17ec218
Merge branch 'main' into new-gpu-codegen-dev
ThrudPrimrose Jan 5, 2026
37db583
Merge remote-tracking branch 'aydogdub/new-gpu-stream-passes' into ne…
ThrudPrimrose Jan 5, 2026
3 changes: 2 additions & 1 deletion dace/codegen/CMakeLists.txt
@@ -58,7 +58,8 @@ foreach(DACE_FILE ${DACE_FILES})
# Make the path absolute
set(DACE_FILE ${DACE_SRC_DIR}/${DACE_FILE})
# Now treat the file according to the deduced target
if(${DACE_FILE_TARGET} STREQUAL "cuda")
# previous: if(${DACE_FILE_TARGET} STREQUAL "cuda"). Needed to work with experimental
if(${DACE_FILE_TARGET} STREQUAL "experimental_cuda" OR ${DACE_FILE_TARGET} STREQUAL "cuda")
if(${DACE_FILE_TARGET_TYPE} MATCHES "hip")
set(DACE_ENABLE_HIP ON)
set(DACE_HIP_FILES ${DACE_HIP_FILES} ${DACE_FILE})
47 changes: 43 additions & 4 deletions dace/codegen/instrumentation/gpu_events.py
@@ -129,7 +129,7 @@ def on_scope_entry(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, n
'GPU_Device map scopes')

idstr = 'b' + self._idstr(cfg, state, node)
stream = getattr(node, '_cuda_stream', -1)
stream = self._get_gpu_stream(state, node)
outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)

def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.ExitNode,
@@ -139,7 +139,7 @@ def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no
s = self._get_sobj(node)
if s.instrument == dtypes.InstrumentationType.GPU_Events:
idstr = 'e' + self._idstr(cfg, state, entry_node)
stream = getattr(node, '_cuda_stream', -1)
stream = self._get_gpu_stream(state, node)
outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)
outer_stream.write(self._report('%s %s' % (type(s).__name__, s.label), cfg, state, entry_node), cfg,
state_id, node)
@@ -153,7 +153,7 @@ def on_node_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no
if node.instrument == dtypes.InstrumentationType.GPU_Events:
state_id = state.parent_graph.node_id(state)
idstr = 'b' + self._idstr(cfg, state, node)
stream = getattr(node, '_cuda_stream', -1)
stream = self._get_gpu_stream(state, node)
outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)

def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node,
@@ -165,7 +165,46 @@ def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node
if node.instrument == dtypes.InstrumentationType.GPU_Events:
state_id = state.parent_graph.node_id(state)
idstr = 'e' + self._idstr(cfg, state, node)
stream = getattr(node, '_cuda_stream', -1)
stream = self._get_gpu_stream(state, node)
outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node)
outer_stream.write(self._report('%s %s' % (type(node).__name__, node.label), cfg, state, node), cfg,
state_id, node)

def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int:
"""
Return the GPU stream ID assigned to a given node.

- In the CUDACodeGen, the stream ID is stored as the private attribute
``_cuda_stream`` on the node.
- In the ExperimentalCUDACodeGen, streams are explicitly assigned to tasklets
and GPU_Device-scheduled maps (kernels) via a GPU stream AccessNode. For
other node types, no reliable stream assignment is available.

Parameters
----------
state : SDFGState
The state containing the node.
node : dace.sdfg.nodes.Node
The node for which to query the GPU stream.

Returns
-------
int
The assigned GPU stream ID, or ``-1`` if none could be determined.
"""
if config.Config.get('compiler', 'cuda', 'implementation') == 'legacy':
stream = getattr(node, '_cuda_stream', -1)

else:
stream = -1
for in_edge in state.in_edges(node):
src = in_edge.src
if (isinstance(src, nodes.AccessNode) and src.desc(state).dtype == dtypes.gpuStream_t):
stream = int(in_edge.data.subset)

for out_edge in state.out_edges(node):
dst = out_edge.dst
if (isinstance(dst, nodes.AccessNode) and dst.desc(state).dtype == dtypes.gpuStream_t):
stream = int(out_edge.data.subset)

return stream
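The lookup above can be sketched in isolation: in the legacy path the stream ID lives in the private `_cuda_stream` attribute, while in the experimental path it is the memlet subset of an edge touching a `gpuStream_t` AccessNode. A minimal model using stand-in classes (`FakeAccessNode`, `FakeEdge`, `FakeState` are illustrative, not the DaCe API):

```python
# Stand-alone sketch of _get_gpu_stream. The Fake* classes below mimic just
# enough of the SDFG state/node/edge interfaces to show the lookup logic.

GPU_STREAM_T = "gpuStream_t"  # stand-in for dtypes.gpuStream_t


class FakeAccessNode:
    def __init__(self, dtype: str):
        self.dtype = dtype


class FakeEdge:
    def __init__(self, src, dst, subset: str):
        self.src, self.dst, self.subset = src, dst, subset


class FakeState:
    def __init__(self, in_edges, out_edges):
        self._in, self._out = in_edges, out_edges

    def in_edges(self, node):
        return self._in

    def out_edges(self, node):
        return self._out


def get_gpu_stream(state, node, legacy: bool = False) -> int:
    if legacy:
        # Legacy CUDACodeGen: stream ID stored directly on the node.
        return getattr(node, "_cuda_stream", -1)
    stream = -1
    # Experimental codegen: the stream ID is the subset of an edge that
    # connects the node to a gpuStream_t AccessNode.
    for e in state.in_edges(node):
        if isinstance(e.src, FakeAccessNode) and e.src.dtype == GPU_STREAM_T:
            stream = int(e.subset)
    for e in state.out_edges(node):
        if isinstance(e.dst, FakeAccessNode) and e.dst.dtype == GPU_STREAM_T:
            stream = int(e.subset)
    return stream


# A kernel node fed by the stream AccessNode at index 2:
kernel = object()
state = FakeState([FakeEdge(FakeAccessNode(GPU_STREAM_T), kernel, "2")], [])
print(get_gpu_stream(state, kernel))  # 2
```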
1 change: 1 addition & 0 deletions dace/codegen/targets/__init__.py
@@ -9,3 +9,4 @@
from .mlir.mlir import MLIRCodeGen
from .sve.codegen import SVECodeGen
from .snitch import SnitchCodeGen
from .experimental_cuda import ExperimentalCUDACodeGen
16 changes: 11 additions & 5 deletions dace/codegen/targets/cpp.py
@@ -236,14 +236,22 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher',

def is_cuda_codegen_in_device(framecode) -> bool:
"""
Check the state of the CUDA code generator, whether it is inside device code.
Check whether the (legacy or experimental) CUDA code generator is currently inside device code.
"""
from dace.codegen.targets.cuda import CUDACodeGen
from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen

cuda_impl = Config.get('compiler', 'cuda', 'implementation')
if cuda_impl == 'legacy':
cudaClass = CUDACodeGen
elif cuda_impl == 'experimental':
cudaClass = ExperimentalCUDACodeGen
else:
raise ValueError(f"Unknown CUDA codegen implementation: {cuda_impl}")

if framecode is None:
cuda_codegen_in_device = False
else:
for codegen in framecode.targets:
if isinstance(codegen, CUDACodeGen):
if isinstance(codegen, cudaClass):
cuda_codegen_in_device = codegen._in_device_code
break
else:
Expand All @@ -266,11 +274,9 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str:
root = name.split('.')[0]
if root in sdfg.arrays and isinstance(sdfg.arrays[root], data.Structure):
name = name.replace('.', '->')

# Special case: If memory is persistent and defined in this SDFG, add state
# struct to name
if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)):

if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays
return f'__{sdfg.cfg_id}_{name}'
elif not is_cuda_codegen_in_device(framecode): # GPU kernels cannot access state
@@ -936,7 +942,7 @@ def unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, callsite_st
# set the stream to a local variable.
max_streams = int(Config.get("compiler", "cuda", "max_concurrent_streams"))
if not is_devicelevel_gpu(sdfg, state_dfg, node) and (hasattr(node, "_cuda_stream")
or connected_to_gpu_memory(node, state_dfg, sdfg)):
and connected_to_gpu_memory(node, state_dfg, sdfg)):
if max_streams >= 0:
callsite_stream.write(
'int __dace_current_stream_id = %d;\n%sStream_t __dace_current_stream = __state->gpu_context->streams[__dace_current_stream_id];'
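The `unparse_tasklet` change above tightens the guard from `or` to `and`: a host-side tasklet now receives a local `__dace_current_stream` variable only when it both carries a `_cuda_stream` attribute and is actually connected to GPU memory. A minimal sketch of the corrected predicate (the function name and parameters are illustrative, not the DaCe API):

```python
# Sketch of the guard fixed in unparse_tasklet. With the old `or`, a tasklet
# carrying a stale `_cuda_stream` attribute but no GPU-memory neighbors still
# got a stream variable emitted; with `and`, both conditions must hold.

def needs_stream_var(has_cuda_stream_attr: bool, touches_gpu_memory: bool,
                     in_device_code: bool) -> bool:
    # Device-level code never declares a host-side stream variable.
    if in_device_code:
        return False
    # Both the stream assignment and a GPU-memory connection are required.
    return has_cuda_stream_attr and touches_gpu_memory

print(needs_stream_var(True, False, False))  # False (was True with `or`)
print(needs_stream_var(True, True, False))   # True
```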
20 changes: 20 additions & 0 deletions dace/codegen/targets/cpu.py
@@ -513,6 +513,13 @@ def allocate_array(self,

return
elif (nodedesc.storage == dtypes.StorageType.Register):

if nodedesc.dtype == dtypes.gpuStream_t:
ctype = dtypes.gpuStream_t.ctype
allocation_stream.write(f"{ctype}* {name} = __state->gpu_context->streams;")
define_var(name, DefinedType.Pointer, ctype)
return

ctypedef = dtypes.pointer(nodedesc.dtype).ctype
if nodedesc.start_offset != 0:
raise NotImplementedError('Start offset unsupported for registers')
@@ -588,6 +595,9 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap

if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)):
return
elif nodedesc.dtype == dtypes.gpuStream_t:
callsite_stream.write(f"{alloc_name} = nullptr;")
return
elif (nodedesc.storage == dtypes.StorageType.CPU_Heap
or (nodedesc.storage == dtypes.StorageType.Register and
(symbolic.issymbolic(arrsize, sdfg.constants) or
@@ -1008,6 +1018,11 @@ def process_out_memlets(self,
dst_edge = dfg.memlet_path(edge)[-1]
dst_node = dst_edge.dst

if isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state).dtype == dtypes.gpuStream_t:
# Special case: GPU streams do not represent data flow - they assign GPU streams to kernels/tasks.
# Thus, nothing needs to be written, and out memlets of this kind should be ignored.
continue

# Target is neither a data nor a tasklet node
if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode)
and not isinstance(dst_node, nodes.CodeNode)):
@@ -1049,6 +1064,7 @@ def process_out_memlets(self,
# Tasklet -> array with a memlet. Writing to array is emitted only if the memlet is not empty
if isinstance(node, nodes.CodeNode) and not edge.data.is_empty():
if not uconn:
return
raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format(
str(edge.src), str(edge.dst)))

@@ -1585,6 +1601,10 @@ def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: State
cdtype = src_node.out_connectors[edge.src_conn]
if isinstance(sdfg.arrays[edge.data.data], data.Stream):
pass
elif isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state_dfg).dtype == dtypes.gpuStream_t:
# Special case: GPU streams do not represent data flow - they assign GPU streams to kernels/tasks.
# Thus, nothing needs to be written.
pass
elif isinstance(cdtype, dtypes.pointer): # If pointer, also point to output
desc = sdfg.arrays[edge.data.data]

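The `process_out_memlets` and `define_out_memlet` changes in `cpu.py` both apply the same rule: an edge whose endpoint is a `gpuStream_t` AccessNode expresses a stream assignment, not data movement, so no copy code is emitted for it. A standalone sketch of that filtering, using stand-in classes (`Access` and `filter_stream_assignments` are illustrative, not the DaCe API):

```python
# Sketch of the out-memlet filtering added in cpu.py: destinations that are
# gpuStream_t AccessNodes carry stream assignments rather than data flow,
# so they are skipped when emitting write-back code.

GPU_STREAM_T = "gpuStream_t"  # stand-in for dtypes.gpuStream_t


class Access:
    """Minimal stand-in for an AccessNode with a data type."""

    def __init__(self, name: str, dtype: str):
        self.name = name
        self.dtype = dtype


def filter_stream_assignments(dst_nodes):
    """Return only the destinations for which copy code must be emitted."""
    emitted = []
    for dst in dst_nodes:
        if isinstance(dst, Access) and dst.dtype == GPU_STREAM_T:
            continue  # stream-assignment edge: nothing to write
        emitted.append(dst)
    return emitted


outs = [Access("A", "float64"), Access("s0", GPU_STREAM_T)]
print([a.name for a in filter_stream_assignments(outs)])  # ['A']
```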