diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt index 5a8e6438eb..ce71cda1a6 100644 --- a/dace/codegen/CMakeLists.txt +++ b/dace/codegen/CMakeLists.txt @@ -58,7 +58,8 @@ foreach(DACE_FILE ${DACE_FILES}) # Make the path absolute set(DACE_FILE ${DACE_SRC_DIR}/${DACE_FILE}) # Now treat the file according to the deduced target - if(${DACE_FILE_TARGET} STREQUAL "cuda") + # previous: if(${DACE_FILE_TARGET} STREQUAL "cuda"). Needed to work with experimental + if(${DACE_FILE_TARGET} STREQUAL "experimental_cuda" OR ${DACE_FILE_TARGET} STREQUAL "cuda") if(${DACE_FILE_TARGET_TYPE} MATCHES "hip") set(DACE_ENABLE_HIP ON) set(DACE_HIP_FILES ${DACE_HIP_FILES} ${DACE_FILE}) diff --git a/dace/codegen/instrumentation/gpu_events.py b/dace/codegen/instrumentation/gpu_events.py index 9c653342cd..99a91e3b3f 100644 --- a/dace/codegen/instrumentation/gpu_events.py +++ b/dace/codegen/instrumentation/gpu_events.py @@ -129,7 +129,7 @@ def on_scope_entry(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, n 'GPU_Device map scopes') idstr = 'b' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.ExitNode, @@ -139,7 +139,7 @@ def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no s = self._get_sobj(node) if s.instrument == dtypes.InstrumentationType.GPU_Events: idstr = 'e' + self._idstr(cfg, state, entry_node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) outer_stream.write(self._report('%s %s' % (type(s).__name__, s.label), cfg, state, entry_node), cfg, state_id, node) @@ -153,7 +153,7 @@ def on_node_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no if node.instrument == dtypes.InstrumentationType.GPU_Events: state_id = state.parent_graph.node_id(state) idstr = 'b' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node, @@ -165,7 +165,46 @@ def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node if node.instrument == dtypes.InstrumentationType.GPU_Events: state_id = state.parent_graph.node_id(state) idstr = 'e' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) outer_stream.write(self._report('%s %s' % (type(node).__name__, node.label), cfg, state, node), cfg, state_id, node) + + def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int: + """ + Return the GPU stream ID assigned to a given node. + + - In the CUDACodeGen, the stream ID is stored as the private attribute + ``_cuda_stream`` on the node. + - In the ExperimentalCUDACodeGen, streams are explicitly assigned to tasklets + and GPU_Device-scheduled maps (kernels) via a GPU stream AccessNode. For + other node types, no reliable stream assignment is available. + + Parameters + ---------- + state : SDFGState + The state containing the node. + node : dace.sdfg.nodes.Node + The node for which to query the GPU stream. 
+ + Returns + ------- + int + The assigned GPU stream ID, or ``-1`` if none could be determined. + """ + if config.Config.get('compiler', 'cuda', 'implementation') == 'legacy': + stream = getattr(node, '_cuda_stream', -1) + + else: + stream = -1 + for in_edge in state.in_edges(node): + src = in_edge.src + if (isinstance(src, nodes.AccessNode) and src.desc(state).dtype == dtypes.gpuStream_t): + stream = int(in_edge.data.subset) + + for out_edge in state.out_edges(node): + dst = out_edge.dst + if (isinstance(dst, nodes.AccessNode) and dst.desc(state).dtype == dtypes.gpuStream_t): + stream = int(out_edge.data.subset) + + return stream diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index cd4d5f957f..5c9027e68e 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,3 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen +from .experimental_cuda import ExperimentalCUDACodeGen diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index b451668831..12f09ba42c 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -236,14 +236,22 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher', def is_cuda_codegen_in_device(framecode) -> bool: """ - Check the state of the CUDA code generator, whether it is inside device code. + Check the state of the (Experimental) CUDA code generator, whether it is inside device code. """ from dace.codegen.targets.cuda import CUDACodeGen + from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen + + cuda_impl = Config.get('compiler', 'cuda', 'implementation') + if cuda_impl == 'legacy': + cudaClass = CUDACodeGen + elif cuda_impl == 'experimental': + cudaClass = ExperimentalCUDACodeGen + if framecode is None: cuda_codegen_in_device = False else: for codegen in framecode.targets: - if isinstance(codegen, CUDACodeGen): + if isinstance(codegen, cudaClass): cuda_codegen_in_device = codegen._in_device_code break else: @@ -266,11 +274,9 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: root = name.split('.')[0] if root in sdfg.arrays and isinstance(sdfg.arrays[root], data.Structure): name = name.replace('.', '->') - # Special case: If memory is persistent and defined in this SDFG, add state # struct to name if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): - if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' elif not is_cuda_codegen_in_device(framecode): # GPU kernels cannot access state @@ -936,7 +942,7 @@ def unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, callsite_st # set the stream to a local variable. 
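
For orientation, here is a minimal sketch (not part of the patch) of the SDFG structure that the new `_get_gpu_stream` helper expects under the experimental codegen: an AccessNode whose descriptor has dtype `dtypes.gpuStream_t` (introduced by this PR) is wired to a tasklet, and the assigned stream index is encoded in the memlet subset. The array name `gpu_streams`, the connector `stream_in`, and the Register storage choice are illustrative assumptions, not taken from the PR.

    import dace
    from dace import dtypes
    from dace.sdfg import nodes

    sdfg = dace.SDFG('gpu_stream_sketch')
    state = sdfg.add_state()

    # Hypothetical GPU-stream array; dtypes.gpuStream_t is the type added by this PR
    sdfg.add_array('gpu_streams', shape=(4,), dtype=dtypes.gpuStream_t,
                   storage=dtypes.StorageType.Register, transient=True)

    streams = state.add_access('gpu_streams')
    tasklet = state.add_tasklet('work', {'stream_in'}, set(), 'pass')

    # The memlet subset ("2") encodes the GPU stream assigned to the tasklet
    state.add_edge(streams, None, tasklet, 'stream_in', dace.Memlet('gpu_streams[2]'))

    # Recovering the assignment, analogous to the non-legacy branch of _get_gpu_stream
    for e in state.in_edges(tasklet):
        if isinstance(e.src, nodes.AccessNode) and e.src.desc(state).dtype == dtypes.gpuStream_t:
            print(e.data.subset)  # assigned stream index; _get_gpu_stream converts this subset to an int
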
max_streams = int(Config.get("compiler", "cuda", "max_concurrent_streams"))
     if not is_devicelevel_gpu(sdfg, state_dfg, node) and (hasattr(node, "_cuda_stream")
-                                                          or connected_to_gpu_memory(node, state_dfg, sdfg)):
+                                                          and connected_to_gpu_memory(node, state_dfg, sdfg)):
         if max_streams >= 0:
             callsite_stream.write(
                 'int __dace_current_stream_id = %d;\n%sStream_t __dace_current_stream = __state->gpu_context->streams[__dace_current_stream_id];'
diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index 5e71cbb074..228613bae7 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -513,6 +513,13 @@ def allocate_array(self,
             return
         elif (nodedesc.storage == dtypes.StorageType.Register):
+
+            if nodedesc.dtype == dtypes.gpuStream_t:
+                ctype = dtypes.gpuStream_t.ctype
+                allocation_stream.write(f"{ctype}* {name} = __state->gpu_context->streams;")
+                define_var(name, DefinedType.Pointer, ctype)
+                return
+
             ctypedef = dtypes.pointer(nodedesc.dtype).ctype
             if nodedesc.start_offset != 0:
                 raise NotImplementedError('Start offset unsupported for registers')
@@ -588,6 +595,9 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap
         if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)):
             return
+        elif nodedesc.dtype == dtypes.gpuStream_t:
+            callsite_stream.write(f"{alloc_name} = nullptr;")
+            return
         elif (nodedesc.storage == dtypes.StorageType.CPU_Heap
               or (nodedesc.storage == dtypes.StorageType.Register and
                   (symbolic.issymbolic(arrsize, sdfg.constants) or
@@ -1008,6 +1018,11 @@ def process_out_memlets(self,
                 dst_edge = dfg.memlet_path(edge)[-1]
                 dst_node = dst_edge.dst
 
+            if isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state).dtype == dtypes.gpuStream_t:
+                # Special case: GPU stream access nodes do not represent data flow; they assign GPU streams
+                # to kernels/tasklets. Thus, nothing needs to be written, and out-memlets of this kind are ignored.
+                continue
+
             # Target is neither a data nor a tasklet node
             if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode)
                                                        and not isinstance(dst_node, nodes.CodeNode)):
@@ -1049,6 +1064,7 @@ def process_out_memlets(self,
             # Tasklet -> array with a memlet. Writing to array is emitted only if the memlet is not empty
             if isinstance(node, nodes.CodeNode) and not edge.data.is_empty():
                 if not uconn:
+                    return
                     raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format(
                         str(edge.src), str(edge.dst)))
@@ -1585,6 +1601,10 @@ def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: State
         cdtype = src_node.out_connectors[edge.src_conn]
         if isinstance(sdfg.arrays[edge.data.data], data.Stream):
             pass
+        elif isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state_dfg).dtype == dtypes.gpuStream_t:
+            # Special case: GPU stream access nodes do not represent data flow; they assign GPU streams
+            # to kernels/tasklets. Thus, nothing needs to be written.
+            pass
         elif isinstance(cdtype, dtypes.pointer):
             # If pointer, also point to output
             desc = sdfg.arrays[edge.data.data]
diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py
new file mode 100644
index 0000000000..0d3dce577c
--- /dev/null
+++ b/dace/codegen/targets/experimental_cuda.py
@@ -0,0 +1,1552 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+import networkx as nx
+
+import dace
+from dace import data as dt, Memlet
+from dace import dtypes, registry, symbolic, subsets
+from dace.config import Config
+from dace.sdfg import SDFG, ScopeSubgraphView, SDFGState, nodes
+from dace.sdfg import utils as sdutil
+from dace.sdfg.graph import MultiConnectorEdge
+from dace.sdfg.state import ControlFlowRegion, StateSubgraphView
+
+from dace.codegen import common
+from dace.codegen.codeobject import CodeObject
+from dace.codegen.dispatcher import DefinedType, TargetDispatcher
+from dace.codegen.prettycode import CodeIOStream
+from dace.codegen.common import update_persistent_desc
+from dace.codegen.targets.cpp import (codeblock_to_cpp, memlet_copy_to_absolute_strides, mangle_dace_state_struct_name,
+                                      ptr, sym2cpp)
+from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute
+
+# DaCe transformation imports
+from dace.transformation.passes import analysis as ap
+from dace.transformation.pass_pipeline import Pipeline
+from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler
+from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
+from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels
+from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets
+from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets
+from dace.transformation.passes.gpustream.gpu_stream_topology_simplification import GPUStreamTopologySimplification
+from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets
+from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync
+from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap
+from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize
+
+# Experimental CUDA helper imports
+from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager
+from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call, get_defined_type
+
+from dace.codegen.targets import cpp
+
+# Type checking imports (conditional)
+if TYPE_CHECKING:
+    from dace.codegen.targets.framecode import DaCeCodeGenerator
+    from dace.codegen.targets.cpu import CPUCodeGen
+
+
+@registry.autoregister_params(name='experimental_cuda')
+class ExperimentalCUDACodeGen(TargetCodeGenerator):
+    """ Experimental CUDA code generator. """
+    target_name = 'experimental_cuda'
+    title = 'CUDA'
+
+    ###########################################################################
+    # Initialization & Preprocessing
+
+    def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG):
+
+        self._frame: DaCeCodeGenerator = frame_codegen  # creates the frame code and orchestrates code generation for targets
+        self._dispatcher: TargetDispatcher = frame_codegen.dispatcher  # dispatches code generation to the appropriate target
+
+        self._in_device_code = False
+        self._cpu_codegen: Optional['CPUCodeGen'] = None
+
+        # NOTE: Moved from preprocessing to here
+        self.backend: str = common.get_gpu_backend()
+        self.language = 'cu' if self.backend == 'cuda' else 'cpp'
+        target_type = '' if self.backend == 'cuda' else self.backend
+        self._codeobject = CodeObject(sdfg.name + '_' + 'cuda',
+                                      '',
+                                      self.language,
+                                      ExperimentalCUDACodeGen,
+                                      'CUDA',
+                                      target_type=target_type)
+
+        self._localcode = CodeIOStream()
+        self._globalcode = CodeIOStream()
+
+        # TODO: _initcode and _exitcode do not seem to serve any purpose at the moment.
+        self._initcode = CodeIOStream()
+        self._exitcode = CodeIOStream()
+
+        self._global_sdfg: SDFG = sdfg
+        self._toplevel_schedule = None
+
+        # Positions at which to deallocate memory pool arrays
+        self.pool_release: Dict[Tuple[SDFG, str], Tuple[SDFGState, Set[nodes.Node]]] = {}
+        self.has_pool = False
+
+        # INFO:
+        # Register GPU schedules and storage types for ExperimentalCUDACodeGen.
+        # The dispatcher maps GPU-related schedules and storage types to the
+        # appropriate code generation functions in this code generator.
+
+        # Register dispatchers
+        self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher()
+
+        self._dispatcher = frame_codegen.dispatcher
+        self._dispatcher.register_map_dispatcher(dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN, self)
+        self._dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate)
+        self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate)
+
+        # TODO: Add this to dtypes as well
+        gpu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned]
+
+        self._dispatcher.register_array_dispatcher(gpu_storage, self)
+        self._dispatcher.register_array_dispatcher(dtypes.StorageType.CPU_Pinned, self)
+        for storage in gpu_storage:
+            for other_storage in dtypes.StorageType:
+                self._dispatcher.register_copy_dispatcher(storage, other_storage, None, self)
+                self._dispatcher.register_copy_dispatcher(other_storage, storage, None, self)
+
+        # NOTE:
+        # The "register illegal copies" code from cuda.py was NOT copied here;
+        # it was never needed.
+
+        ################## New variables ##########################
+
+        self._current_kernel_spec: Optional[KernelSpec] = None
+        self._gpu_stream_manager: Optional[GPUStreamManager] = None
+        self._kernel_dimensions_map: Set[nodes.MapEntry] = set()
+        self._kernel_arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {}
+
+    def preprocess(self, sdfg: SDFG) -> None:
+        """
+        Preprocess the SDFG to prepare it for GPU code generation. This includes:
+
+        - Handling GPU<->GPU strided copies.
+        - Adding explicit GPU_ThreadBlock maps where they are missing and inferring grid and block
+          dimensions for every kernel in the SDFG.
+        - Running a pipeline that makes GPU streams explicit at the SDFG level and handling other
+          GPU stream related initialization.
+        - TODO
+        - Handling memory pool management.
+
+        Note that the order of the steps matters, e.g. TODO
+        """
+
+        #------------------------- Handle GPU<->GPU strided copies --------------------------
+
+        # Find GPU<->GPU strided copies that cannot be represented by a single copy command
+        from dace.transformation.dataflow import CopyToMap
+        for e, state in list(sdfg.all_edges_recursive()):
+            if isinstance(e.src, nodes.AccessNode) and isinstance(e.dst, nodes.AccessNode):
+                nsdfg = state.parent
+                if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global
+                        and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global):
+                    copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides(
+                        None, nsdfg, state, e, e.src, e.dst)
+                    dims = len(copy_shape)
+
+                    # Skip supported copy types
+                    if dims == 1:
+                        continue
+                    elif dims == 2:
+                        if src_strides[-1] != 1 or dst_strides[-1] != 1:
+                            # NOTE: Special case of a contiguous copy
+                            # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
+                            # with copy shape [I, J] and strides [J*K, K], [J, 1]
+                            try:
+                                is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
+                                is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
+                            except (TypeError, ValueError):
+                                is_src_cont = False
+                                is_dst_cont = False
+                            if is_src_cont and is_dst_cont:
+                                continue
+                        else:
+                            continue
+                    elif dims > 2:
+                        if not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+                            continue
+
+                    # Turn unsupported copy into a map
+                    try:
+                        CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst)
+                    except ValueError:  # If the transformation doesn't match, continue normally
+                        continue
+
+        #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes --------------------
+
+        # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion
+        # of ThreadBlock maps. Note: the original Kernel Entry was transformed into a ThreadBlock map,
+        # and a new GPU_Device (i.e., Kernel) map was inserted on top of it.
+        old_nodes = set(node for node, _ in sdfg.all_nodes_recursive())
+
+        # Insert default explicit GPU_ThreadBlock maps where they are missing
+        sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)
+
+        new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes
+        kernels_with_added_tb_maps = {
+            n
+            for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device
+        }
+
+        # Infer GPU grid and block dimensions
+        self._kernel_dimensions_map = InferGPUGridAndBlockSize().apply_pass(sdfg, kernels_with_added_tb_maps)
+
+        #------------------------- GPU Stream related Logic --------------------------
+
+        # Register GPU context in state struct
+        self._frame.statestruct.append('dace::cuda::Context *gpu_context;')
+
+        # Prepare the pipeline that makes GPU streams explicit: add and connect SDFG nodes
+        # with GPU stream AccessNodes where used
+        stream_pipeline = Pipeline([
+            NaiveGPUStreamScheduler(),
+            InsertGPUStreamsToSDFGs(),
+            InsertGPUStreamsToKernels(),
+            InsertGPUStreamsToTasklets(),
+            InsertGPUStreamSyncTasklets(),
+            InsertGPUCopyTasklets(),
+            GPUStreamTopologySimplification(),
+        ])
+
+        # TODO: Copies may be missed due to InsertGPUCopyTasklets -> maybe check whether the copies were
+        # handled above rather than just adding this codegen to used_targets by default
+        self._dispatcher._used_targets.add(self)
+        gpustream_assignments = stream_pipeline.apply_pass(sdfg, {})['NaiveGPUStreamScheduler']
+
+        # Initialize runtime GPU stream manager
+        self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments)
+
+        #----------------- Shared Memory Synchronization related Logic -----------------
+
+        auto_sync = Config.get('compiler', 'cuda', 'auto_syncthreads_insertion')
+        if auto_sync:
+            DefaultSharedMemorySync().apply_pass(sdfg, None)
+
+        #------------------------- Memory Pool related Logic --------------------------
+
+        # Find points where memory should be released to the memory pool
+        self._compute_pool_release(sdfg)
+
+        # Retrieve the arguments required for each kernel's subgraph
+        shared_transients = {}
+        for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True):
+            if (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device):
+                if state.parent not in shared_transients:
+                    shared_transients[state.parent] = state.parent.shared_transients()
+                self._kernel_arglists[node] = state.scope_subgraph(node).arglist(defined_syms,
+                                                                                 shared_transients[state.parent])
+
+    def _compute_pool_release(self, top_sdfg: SDFG):
+        """
+        Computes positions in the code generator where a memory pool array is no longer used and
+        ``backendFreeAsync`` should be called to release it.
+
+        :param top_sdfg: The top-level SDFG to traverse.
+        :raises ValueError: If the backend does not support memory pools.
+        """
+        # Find release points for every array in every SDFG
+        reachability = access_nodes = None
+        for sdfg in top_sdfg.all_sdfgs_recursive():
+            # Skip SDFGs without memory pool hints
+            pooled = set(aname for aname, arr in sdfg.arrays.items()
+                         if getattr(arr, 'pool', False) is True and arr.transient)
+            if not pooled:
+                continue
+            self.has_pool = True
+            if self.backend != 'cuda':
+                raise ValueError(f'Backend "{self.backend}" does not support the memory pool allocation hint')
+
+            # Keep only global arrays
+            pooled = filter(
+                lambda aname: sdfg.arrays[aname].lifetime in
+                (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent,
+                 dtypes.AllocationLifetime.External), pooled)
+
+            # Lazily compute reachability and access nodes
+            if reachability is None:
+                reachability = ap.StateReachability().apply_pass(top_sdfg, {})
+                access_nodes = ap.FindAccessStates().apply_pass(top_sdfg, {})
+
+            reachable = reachability[sdfg.cfg_id]
+            access_sets = access_nodes[sdfg.cfg_id]
+            for state in sdfg.states():
+                # Find all data descriptors that will no longer be used after this state
+                last_state_arrays: Set[str] = set(
+                    s for s in access_sets
+                    if s in pooled and state in access_sets[s] and not (access_sets[s] & reachable[state]) - {state})
+
+                anodes = list(state.data_nodes())
+                for aname in last_state_arrays:
+                    # Find out if there is a common descendant access node.
+                    # If not, release at end of state
+                    ans = [an for an in anodes if an.data == aname]
+                    terminator = None
+                    for an1 in ans:
+                        if all(nx.has_path(state.nx, an2, an1) for an2 in ans if an2 is not an1):
+                            terminator = an1
+                            break
+
+                    # Old logic below; now we use the GPU stream manager, which automatically returns nullptr
+                    # for all nodes that did not get assigned a CUDA stream.
+                    """
+                    # Enforce a cuda_stream field so that the state-wide deallocation would work
+                    if not hasattr(an1, '_cuda_stream'):
+                        an1._cuda_stream = 'nullptr'
+                    """
+
+                    # If an access node was found, find the point where all its reads are complete
+                    terminators = set()
+                    if terminator is not None:
+                        parent = state.entry_node(terminator)
+                        # If within a scope, once all memlet paths going out of that scope are complete,
+                        # it is time to release the memory
+                        if parent is not None:
+                            # Just to be safe, release at end of state (e.g., if misused in Sequential map)
+                            terminators = set()
+                        else:
+                            # Otherwise, find common descendant (or end of state) following the ends of
+                            # all memlet paths (e.g., (a)->...->[tasklet]-->...->(b))
+                            for e in state.out_edges(terminator):
+                                if isinstance(e.dst, nodes.EntryNode):
+                                    terminators.add(state.exit_node(e.dst))
+                                else:
+                                    terminators.add(e.dst)
+                            # After all outgoing memlets of all the terminators have been processed, memory
+                            # will be released
+
+                    self.pool_release[(sdfg, aname)] = (state, terminators)
+
+            # If there is unfreed pooled memory, free at the end of the SDFG
+            unfreed = set(arr for arr in pooled if (sdfg, arr) not in self.pool_release)
+            if unfreed:
+                # Find or make single sink node
+                sinks = sdfg.sink_nodes()
+                if len(sinks) == 1:
+                    sink = sinks[0]
+                elif len(sinks) > 1:
+                    sink = sdfg.add_state()
+                    for s in sinks:
+                        sdfg.add_edge(s, sink)
+                else:  # len(sinks) == 0:
+                    raise ValueError('End state not found when trying to free pooled memory')
+
+                # Add sink as terminator state
+                for arr in unfreed:
+                    self.pool_release[(sdfg, arr)] = (sink, set())
+
+    ###########################################################################
+    # Determine whether initializer and finalizer should be called
+
+    @property
+    def has_initializer(self) -> bool:
+        return True
+
+    @property
+    def has_finalizer(self) -> bool:
+        return True
+
+    ###########################################################################
+    # Scope generation
+
+    def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int,
+                       function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+
+        # Import strategies here to avoid circular dependencies
+        from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import (ScopeGenerationStrategy,
+                                                                                     KernelScopeGenerator,
+                                                                                     ThreadBlockScopeGenerator,
+                                                                                     WarpScopeGenerator)
+        # Entry node of the scope
+        scope_entry = 
dfg_scope.source_nodes()[0] + + #--------------- Start of Kernel Function Code Generation -------------------- + + if not self._in_device_code: + + # Enter kernel context and recursively generate device code + + state = cfg.state(state_id) + scope_entry = dfg_scope.source_nodes()[0] + scope_exit = dfg_scope.sink_nodes()[0] + scope_entry_stream = CodeIOStream() + scope_exit_stream = CodeIOStream() + + # Instrumentation for kernel scope + instr = self._dispatcher.instrumentation[scope_entry.map.instrument] + if instr is not None: + instr.on_scope_entry(sdfg, cfg, state, scope_entry, callsite_stream, scope_entry_stream, + self._globalcode) + outer_stream = CodeIOStream() + instr.on_scope_exit(sdfg, cfg, state, scope_exit, outer_stream, scope_exit_stream, self._globalcode) + + # New scope for defined variables (kernel functions scope) + self._dispatcher.defined_vars.enter_scope(scope_entry) + + # Store kernel metadata (name, dimensions, arguments, etc.) in a KernelSpec object + # and save it as an attribute + kernel_spec = KernelSpec(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id) + + self._current_kernel_spec = kernel_spec + + # (Re)define variables for the new scope + self._define_variables_in_kernel_scope(sdfg, self._dispatcher) + + # declare and call kernel wrapper function (in the CPU-side code) + self._declare_and_invoke_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Recursively generate GPU code into the kernel_stream (will be in a .cu file) + kernel_stream = CodeIOStream() + kernel_function_stream = self._globalcode + + self._in_device_code = True + + kernel_scope_generator = KernelScopeGenerator(codegen=self) + if kernel_scope_generator.applicable(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream): + kernel_scope_generator.generate(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream) + else: + raise ValueError("Invalid kernel configuration: This strategy is only applicable if the " + "outermost GPU schedule is of type GPU_Device (most likely cause).") + + self._localcode.write(scope_entry_stream.getvalue()) + + # Append generated kernel code to localcode + self._localcode.write(kernel_stream.getvalue() + '\n') + + self._localcode.write(scope_exit_stream.getvalue()) + + # Exit kernel context + self._in_device_code = False + + # Generate kernel wrapper, i.e. 
function which will launch the kernel + self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Exit scope for defined variables + self._dispatcher.defined_vars.exit_scope(scope_entry) + + if instr is not None: + callsite_stream.write(outer_stream.getvalue()) + + return + + import copy + from dace.transformation.passes.fix_test import Fix + from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel + from dace.sdfg import infer_types + + names = Fix().apply_pass(sdfg, {}) + for name, map_parent in names.items(): + MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) + infer_types.infer_connector_types(sdfg) + + #--------------- Nested GPU Scope -------------------- + supported_strategies: List[ScopeGenerationStrategy] = [ + ThreadBlockScopeGenerator(codegen=self), + WarpScopeGenerator(codegen=self) + ] + + for strategy in supported_strategies: + if strategy.applicable(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream): + strategy.generate(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + return + + #--------------- Unsupported Cases -------------------- + # Note: We are inside a nested GPU scope at this point. + + schedule_type = scope_entry.map.schedule + + if schedule_type == dace.ScheduleType.GPU_Device: + raise NotImplementedError("Dynamic parallelism (nested GPU_Device schedules) is not supported.") + + raise NotImplementedError( + f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " + "Please check for supported schedule types or implement the corresponding strategy.") + + def _define_variables_in_kernel_scope(self, sdfg: SDFG, dispatcher: TargetDispatcher): + """ + Define kernel-visible variables in the dispatcher's scope. + + - Certain variables stored in the host-side ``__state`` struct (e.g., persistent or external + data) cannot be accessed directly in kernel code. They are passed as arguments instead, with + pointer names resolved via ``cpp.ptr(..)``. These must be registered in the dispatcher for use + in kernel context. + + - KernelSpec may also mark certain variables/arguments as constants, which must be registered with + the appropriate ``const`` qualifier in their ctype. 
+ """ + # Extract argument and constant definitions from the KernelSpec + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_constants: Set[str] = kernel_spec.kernel_constants + kernel_arglist: Dict[str, dt.Data] = kernel_spec.arglist + + # Save current in_device_code value for restoration later + restore_in_device_code = self._in_device_code + for name, data_desc in kernel_arglist.items(): + + # Only arrays relevant + if not name in sdfg.arrays: + continue + + data_desc = sdfg.arrays[name] + # Get the outer/host pointer name + self._in_device_code = False + host_ptrname = cpp.ptr(name, data_desc, sdfg, self._frame) + + # Get defined type and ctype for the data (use host pointer name) + is_global: bool = data_desc.lifetime in (dtypes.AllocationLifetime.Global, + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) + defined_type, ctype = dispatcher.defined_vars.get(host_ptrname, is_global=is_global) + + # Get the inner/device pointer name + self._in_device_code = True + device_ptrname = cpp.ptr(name, data_desc, sdfg, self._frame) + + # Add the const qualifier if it is a constant AND is not marked as such yet + if name in kernel_constants: + if not "const " in ctype: + ctype = f"const {ctype}" + + # Register variable with the device pointer name for the kernel context + dispatcher.defined_vars.add(device_ptrname, defined_type, ctype, allow_shadowing=True) + + # Restore in_device_code field + self._in_device_code = restore_in_device_code + + def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + + scope_entry = dfg_scope.source_nodes()[0] + + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_name = kernel_spec.kernel_name + kernel_wrapper_args_as_input = kernel_spec.kernel_wrapper_args_as_input + kernel_wrapper_args_typed = kernel_spec.kernel_wrapper_args_typed + + # Declaration of the kernel wrapper function (in the CPU-side code) + function_stream.write( + 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, + state_id, scope_entry) + + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. + state = cfg.state(state_id) + if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('{', cfg, state_id, scope_entry) + + # Synchronize all events leading to dynamic map range connectors + for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): + callsite_stream.write( + self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), + cfg, state_id, scope_entry) + + # Calling the kernel wrapper function (in the CPU-side code) + callsite_stream.write('__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_as_input)), + cfg, state_id, scope_entry) + + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. 
+ if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('}', cfg, state_id, scope_entry) + + def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + scope_entry = dfg_scope.source_nodes()[0] + + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_name = kernel_spec.kernel_name + kernel_args_as_input = kernel_spec.args_as_input + kernel_launch_args_typed = kernel_spec.kernel_wrapper_args_typed + + # get kernel dimensions and transform into a c++ string + grid_dims = kernel_spec.grid_dims + block_dims = kernel_spec.block_dims + gdims = ', '.join(sym2cpp(grid_dims)) + bdims = ', '.join(sym2cpp(block_dims)) + + # ----------------- Kernel Launch Function Declaration ----------------------- + + self._localcode.write( + f""" + DACE_EXPORTED void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}); + void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}) + """, cfg, state_id, scope_entry) + + # Open bracket + self._localcode.write('{', cfg, state_id, scope_entry) + + # ----------------- Guard Checks handling ----------------------- + + # Ensure that iteration space is neither empty nor negative sized + single_dimchecks = [] + for gdim in grid_dims: + # Only emit a guard if we can't statically prove gdim > 0 + if (gdim > 0) != True: + single_dimchecks.append(f'(({sym2cpp(gdim)}) <= 0)') + + dimcheck = ' || '.join(single_dimchecks) + + if dimcheck: + emptygrid_warning = '' + if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): + emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' + 'due to an empty grid.\\n");') + + self._localcode.write( + f''' + if ({dimcheck}) {{ + {emptygrid_warning} + return; + }}''', cfg, state_id, scope_entry) + + # ----------------- Kernel Launch Invocation ----------------------- + stream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + kargs = ', '.join(['(void *)&' + arg for arg in kernel_args_as_input]) + self._localcode.write( + f''' + void *{kernel_name}_args[] = {{ {kargs} }}; + gpuError_t __err = {self.backend}LaunchKernel( + (void*){kernel_name}, dim3({gdims}), dim3({bdims}), {kernel_name}_args, {0}, {stream_var_name} + ); + ''', cfg, state_id, scope_entry) + + self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});\n') + self._localcode.write(generate_sync_debug_call()) + + # Close bracket + self._localcode.write('}', cfg, state_id, scope_entry) + + ########################################################################### + # Generation of Memory Copy Logic + + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], + edge: Tuple[nodes.Node, str, nodes.Node, str, + Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import (CopyContext, + OutOfKernelCopyStrategy, + SyncCollaboritveGPUCopyStrategy) + + context = CopyContext(sdfg, cfg.state(state_id), src_node, dst_node, edge, + self._gpu_stream_manager.gpustream_assignments) + + if OutOfKernelCopyStrategy().applicable(context): + # Handled during the GPU stream pipeline in preprocess() + # in form of explicit tasklets + return + 
+        elif SyncCollaboritveGPUCopyStrategy().applicable(context):
+            code = SyncCollaboritveGPUCopyStrategy().generate_copy(context, self._kernel_dimensions_map)
+            callsite_stream.write(code, cfg, state_id, [src_node, dst_node])
+        else:
+            # Fallback
+            self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream)
+
+    #############################################################################
+    # Predicates for Dispatcher
+
+    def state_dispatch_predicate(self, sdfg, state):
+        """
+        Determines whether a given state should be processed by this
+        code generator (`ExperimentalCUDACodeGen`).
+
+        Returns True if either:
+        1. The state has associated GPU memory that needs to be released
+           (i.e., it appears in `self.pool_release`), or
+        2. The code generator is currently generating device/kernel code.
+        """
+        return any(s is state for s, _ in self.pool_release.values()) or self._in_device_code
+
+    def node_dispatch_predicate(self, sdfg, state, node):
+        """
+        Determines whether a node should be handled by this
+        code generator (`ExperimentalCUDACodeGen`).
+
+        Returns True if:
+        - The node has a GPU schedule handled by this backend, or
+        - The generator is currently generating kernel code.
+        """
+        schedule = getattr(node, 'schedule', None)
+
+        if schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN:
+            return True
+
+        if self._in_device_code:
+            return True
+
+        return False
+
+    #############################################################################
+    # Nested SDFGs & tasklets
+
+    def generate_state(self,
+                       sdfg: SDFG,
+                       cfg: ControlFlowRegion,
+                       state: SDFGState,
+                       function_stream: CodeIOStream,
+                       callsite_stream: CodeIOStream,
+                       generate_state_footer: bool = False) -> None:
+
+        # Use the frame code generator to generate the state
+        self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream)
+
+        # Special case: when not inside device code, release pooled memory that is no longer needed in this state
+        if not self._in_device_code:
+
+            handled_keys = set()
+            backend = self.backend
+            for (pool_sdfg, name), (pool_state, _) in self.pool_release.items():
+
+                if (pool_sdfg is not sdfg) or (pool_state is not state):
+                    continue
+
+                data_descriptor = pool_sdfg.arrays[name]
+                ptrname = ptr(name, data_descriptor, pool_sdfg, self._frame)
+
+                # Adjust if there is an offset
+                if isinstance(data_descriptor, dt.Array) and data_descriptor.start_offset != 0:
+                    ptrname = f'({ptrname} - {sym2cpp(data_descriptor.start_offset)})'
+
+                # Free the memory
+                callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', pool_sdfg)
+                callsite_stream.write(generate_sync_debug_call())
+
+                # We handled the key (pool_sdfg, name) and can remove it later
+                handled_keys.add((pool_sdfg, name))
+
+            # Delete the handled keys here (not in the for loop, which would cause issues)
+            for key in handled_keys:
+                del self.pool_release[key]
+
+        # Invoke all instrumentation providers
+        for instr in self._frame._dispatcher.instrumentation.values():
+            if instr is not None:
+                instr.on_state_end(sdfg, cfg, state, callsite_stream, function_stream)
+
+    def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node,
+                      function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+
+        # Get the node-specific generating function, if one exists
+        gen = getattr(self, '_generate_' + type(node).__name__, False)
+
+        # If no specialized generator is implemented, fall back to the CPU codegen
+        if gen is not False:
+            gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+        elif type(node).__name__ == 'MapExit' and node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN:
+            # Special case: a MapExit of a GPU schedule is already closed by a KernelScopeManager
+            # instance; passing it to the CPU codegen would close the scope a second time.
+            return
+        else:
+            self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+
+    def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label):
+        return 'DACE_DFI ' + self._cpu_codegen.generate_nsdfg_header(
+            sdfg, cfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False)
+
+    def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label):
+        return self._cpu_codegen.generate_nsdfg_call(sdfg,
+                                                     cfg,
+                                                     state,
+                                                     node,
+                                                     memlet_references,
+                                                     sdfg_label,
+                                                     state_struct=False)
+
+    def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node):
+        args = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node)
+        return args
+
+    def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                             node: nodes.NestedSDFG, function_stream: CodeIOStream,
+                             callsite_stream: CodeIOStream) -> None:
+        old_schedule = self._toplevel_schedule
+        self._toplevel_schedule = node.schedule
+        old_codegen = self._cpu_codegen.calling_codegen
+        self._cpu_codegen.calling_codegen = self
+
+        # Determine and update ctype of new constant data and symbols within the NSDFG
+        parent_state: SDFGState = cfg.state(state_id)
+        nsdfg = node.sdfg
+
+        # New scope for defined variables
+        dispatcher: TargetDispatcher = self._dispatcher
+        dispatcher.defined_vars.enter_scope(node)
+
+        # Add the const qualifier to any constants not marked as such
+        """
+        # update const data
+        new_const_data = sdutil.get_constant_data(node, nsdfg)
+        for name in new_const_data:
+            desc = nsdfg.arrays[name]
+            ptr_name = ptr(name, desc, nsdfg, self._frame)
+            try:
+                defined_type, ctype = dispatcher.defined_vars.get(ptr_name, is_global=True)
+            except:
+                defined_type = get_defined_type(desc)
+                if defined_type == DefinedType.Pointer:
+                    ctype = f'{desc.ctype} *'
+                elif defined_type == DefinedType.Scalar:
+                    ctype = desc.ctype
+                else:
+                    raise NotImplementedError("Not expected Type")
+
+            if not "const " in ctype:
+                ctype = f"const {ctype}"
+            dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True)
+
+        # update const symbols
+        new_const_symbols = sdutil.get_constant_symbols(node, nsdfg)
+        for name in new_const_symbols:
+            defined_type = DefinedType.Scalar
+            if not "const" in nsdfg.symbols[name].ctype:
+                ctype = f"const {nsdfg.symbols[name].ctype}"
+        """
+
+        # Redirect the rest to the CPU codegen
+        self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream)
+
+        # Exit scope
+        dispatcher.defined_vars.exit_scope(node)
+
+        self._cpu_codegen.calling_codegen = old_codegen
+        self._toplevel_schedule = old_schedule
+
+    def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                          node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        # Import ScopeManager, which opens and closes brackets for conditions; useful here
+        # because the location dictionary may prescribe which threads/blocks run the tasklet code
+        from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import ScopeManager
+
+        tasklet: nodes.Tasklet = node
+        with ScopeManager(self, sdfg, cfg, dfg, state_id, function_stream, callsite_stream,
+                          
brackets_on_enter=False) as scope_manager: + + if 'gpu_thread' in tasklet.location: + name = 'gpu_thread' + index_expr = self._get_thread_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + if 'gpu_warp' in tasklet.location: + name = 'gpu_warp' + index_expr = self._get_warp_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + if 'gpu_block' in tasklet.location: + name = 'gpu_block' + index_expr = self._get_block_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + # Call CPU codegen + self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + def _generate_condition_from_location(self, name: str, index_expr: str, location: Union[int, str, + subsets.Range]) -> str: + + # 1. Normalize location + if isinstance(location, str) and ':' in location: + location = subsets.Range.from_string(location) + if len(location) != 1: + raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') + elif symbolic.issymbolic(location): + location = sym2cpp(location) + + # 2. Build condition + if isinstance(location, subsets.Range): + # Range of indices + begin, end, stride = location[0] + rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) + cond = f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})' + if stride != 1: + cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)' + else: + # Single-element + cond = f'({index_expr}) == {location}' + + return cond + + def _get_thread_id(self) -> str: + kernel_block_dims: List = self._current_kernel_spec.block_dims + result = 'threadIdx.x' + if kernel_block_dims[1] != 1: + result += f' + ({sym2cpp(kernel_block_dims[0])}) * threadIdx.y' + if kernel_block_dims[2] != 1: + result += f' + ({sym2cpp(kernel_block_dims[0] * kernel_block_dims[1])}) * threadIdx.z' + return result + + def _get_warp_id(self) -> str: + return f'(({self._get_thread_id()}) / warpSize)' + + def _get_block_id(self) -> str: + kernel_block_dims: List = self._current_kernel_spec.block_dims + result = 'blockIdx.x' + if kernel_block_dims[1] != 1: + result += f' + gridDim.x * blockIdx.y' + if kernel_block_dims[2] != 1: + result += f' + gridDim.x * gridDim.y * blockIdx.z' + return result + + ####################################################################### + # Array Declaration, Allocation and Deallocation + + def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: + + ptrname = ptr(node.data, nodedesc, sdfg, self._frame) + fsymbols = self._frame.symbols_and_constants(sdfg) + + # ----------------- Guard checks -------------------- + + # NOTE: `dfg` is None iff `nodedesc` is non-free symbol dependent (see DaCeCodeGenerator.determine_allocation_lifetime). + # We avoid `is_nonfree_sym_dependent` when dfg is None and `nodedesc` is a View. 
+        if dfg and not sdutil.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols):
+            raise NotImplementedError(
+                "declare_array is only for variables that require separate declaration and allocation.")
+
+        if nodedesc.storage == dtypes.StorageType.GPU_Shared:
+            raise NotImplementedError("Dynamic shared memory unsupported")
+
+        if nodedesc.storage == dtypes.StorageType.Register:
+            raise ValueError("Dynamic allocation of registers is not allowed")
+
+        if nodedesc.storage not in {dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned}:
+            raise NotImplementedError(f"CUDA: Unimplemented storage type {nodedesc.storage.name}.")
+
+        if self._dispatcher.declared_arrays.has(ptrname):
+            return  # Already declared
+
+        # ----------------- Declaration --------------------
+        dataname = node.data
+        array_ctype = f'{nodedesc.dtype.ctype} *'
+        declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node)
+        self._dispatcher.declared_arrays.add(dataname, DefinedType.Pointer, array_ctype)
+
+    def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                       node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                       declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None:
+        """
+        Declares (if necessary) and allocates an array. In contrast to ``declare_array``, which only
+        declares data that requires separate declaration and allocation, this method also emits the
+        allocation itself, dispatching to a storage-type-specific handler.
+        """
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+
+        # ------------- Guard checks & Redirect to CPU CodeGen -------------
+
+        # Skip if variable is already defined
+        if self._dispatcher.defined_vars.has(dataname):
+            return
+
+        if isinstance(nodedesc, dace.data.Stream):
+            raise NotImplementedError("allocate_stream not implemented in ExperimentalCUDACodeGen")
+
+        elif isinstance(nodedesc, dace.data.View):
+            return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream,
+                                                   allocation_stream)
+        elif isinstance(nodedesc, dace.data.Reference):
+            return self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream,
+                                                        declaration_stream, allocation_stream)
+
+        # For persistent/external data, refresh the descriptor so it matches the top-level
+        # allocation site (see ``update_persistent_desc``)
+        if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
+            nodedesc = update_persistent_desc(nodedesc, sdfg)
+
+        # NOTE: Experimental GPU stream arrays need no allocation here (they alias the GPU context's streams)
+        if nodedesc.dtype == dtypes.gpuStream_t:
+            return
+
+        # ------------------- Allocation/Declaration -------------------
+
+        # Call the appropriate handler based on storage type
+        gen = getattr(self, f'_prepare_{nodedesc.storage.name}_array', None)
+        if gen:
+            gen(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream)
+        else:
+            raise NotImplementedError(f'CUDA: Unimplemented storage type {nodedesc.storage}')
+
+    def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                                  node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                                  declaration_stream: CodeIOStream, allocation_stream: CodeIOStream):
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+
+        # ------------------- Declaration -------------------
+        declared = self._dispatcher.declared_arrays.has(dataname)
+
+        if not declared:
+            array_ctype = f'{nodedesc.dtype.ctype} *'
+            declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node)
+            self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype)
+
+        # ------------------- Allocation -------------------
+        arrsize = 
nodedesc.total_size + arrsize_malloc = f'{sym2cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + + if nodedesc.pool: + gpu_stream_manager = self._gpu_stream_manager + gpu_stream = gpu_stream_manager.get_stream_node(node) + allocation_stream.write( + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {gpu_stream}));\n', + cfg, state_id, node) + + # Generate synchronization and error-check calls if sync debugging is enabled + allocation_stream.write(generate_sync_debug_call()) + + else: + # Strides are left to the user's discretion + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}Malloc((void**)&{dataname}, {arrsize_malloc}));\n', + cfg, state_id, node) + + # ------------------- Initialization ------------------- + if node.setzero: + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}Memset({dataname}, 0, {arrsize_malloc}));\n', cfg, + state_id, node) + + if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: + allocation_stream.write(f'{dataname} += {sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) + + def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + + # ------------------- Declaration ------------------- + declared = self._dispatcher.declared_arrays.has(dataname) + + if not declared: + array_ctype = f'{nodedesc.dtype.ctype} *' + declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + + # ------------------- Allocation ------------------- + arrsize = nodedesc.total_size + arrsize_malloc = f'{sym2cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + + # Strides are left to the user's discretion + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}MallocHost(&{dataname}, {arrsize_malloc}));\n', cfg, + state_id, node) + if node.setzero: + allocation_stream.write(f'memset({dataname}, 0, {arrsize_malloc});\n', cfg, state_id, node) + + if nodedesc.start_offset != 0: + allocation_stream.write(f'{dataname} += {sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) + + def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + arrsize = nodedesc.total_size + + # ------------------- Guard checks ------------------- + if symbolic.issymbolic(arrsize, sdfg.constants): + raise NotImplementedError('Dynamic shared memory unsupported') + if nodedesc.start_offset != 0: + raise NotImplementedError('Start offset unsupported for shared memory') + + # ------------------- Declaration ------------------- + array_ctype = f'{nodedesc.dtype.ctype} *' + + declaration_stream.write(f'__shared__ {nodedesc.dtype.ctype} {dataname}[{sym2cpp(arrsize)}];\n', cfg, state_id, + node) + + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + + # ------------------- Initialization ------------------- + if node.setzero: + allocation_stream.write( + f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(sym2cpp(self._current_kernel_spec.block_dims))}, {sym2cpp(arrsize)}, ' + f'1, false>::Reset({dataname});\n', cfg, 
state_id, node)
+
+    def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                                node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                                declaration_stream: CodeIOStream, allocation_stream: CodeIOStream):
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+        arrsize = nodedesc.total_size
+
+        # ------------------- Guard checks -------------------
+        if symbolic.issymbolic(arrsize, sdfg.constants):
+            raise ValueError('Dynamic allocation of registers not allowed')
+        if nodedesc.start_offset != 0:
+            raise NotImplementedError('Start offset unsupported for registers')
+
+        # ------------------- Declaration & Initialization -------------------
+        array_ctype = f'{nodedesc.dtype.ctype} *'
+        init_clause = ' = {0}' if node.setzero else ''
+
+        declaration_stream.write(f'{nodedesc.dtype.ctype} {dataname}[{sym2cpp(arrsize)}]{init_clause};\n', cfg,
+                                 state_id, node)
+
+        self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype)
+
+    def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                         node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                         callsite_stream: CodeIOStream) -> None:
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+
+        # Adjust offset if needed
+        if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0:
+            dataname = f'({dataname} - {sym2cpp(nodedesc.start_offset)})'
+
+        # Remove declaration info
+        if self._dispatcher.declared_arrays.has(dataname):
+            is_global = nodedesc.lifetime in (
+                dtypes.AllocationLifetime.Global,
+                dtypes.AllocationLifetime.Persistent,
+                dtypes.AllocationLifetime.External,
+            )
+            self._dispatcher.declared_arrays.remove(dataname, is_global=is_global)
+
+        # Special case: Stream
+        if isinstance(nodedesc, dace.data.Stream):
+            raise NotImplementedError('stream code is not implemented in ExperimentalCUDACodeGen (yet)')
+
+        # Special case: View - no deallocation
+        if isinstance(nodedesc, dace.data.View):
+            return
+
+        # Main deallocation logic by storage type
+        if nodedesc.storage == dtypes.StorageType.GPU_Global:
+            if nodedesc.pool:
+                if (sdfg, dataname) not in self.pool_release:
+                    gpu_stream = self._gpu_stream_manager.get_stream_node(node)
+                    callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeAsync({dataname}, {gpu_stream}));\n', cfg,
+                                          state_id, node)
+            else:
+                callsite_stream.write(f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', cfg, state_id, node)
+
+        elif nodedesc.storage == dtypes.StorageType.CPU_Pinned:
+            if nodedesc.dtype == dtypes.gpuStream_t:
+                return
+            callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeHost({dataname}));\n', cfg, state_id, node)
+
+        elif nodedesc.storage in {dtypes.StorageType.GPU_Shared, dtypes.StorageType.Register}:
+            # No deallocation needed
+            return
+
+        else:
+            raise NotImplementedError(f'Deallocation not implemented for storage type: {nodedesc.storage.name}')
+
+    def get_generated_codeobjects(self):
+        fileheader = CodeIOStream()
+
+        self._frame.generate_fileheader(self._global_sdfg, fileheader, 'cuda')
+
+        # The GPU stream array is given a persistent allocation lifetime (see the GPU stream pipeline in preprocess()).
+        # Thus, the definition of the GPU stream array in the state struct and the accesses to it are handled elsewhere,
+        # in several different files (e.g., framecode.py, cpu.py, cpp.py). For consistency, we initialize it here the
+        # way the other modules expect it, i.e., prefixed with the CFG ID of the SDFG in which it is defined.
+        # Note that all the different variable names point to the same GPU stream array.
+        cnt = 0
+        init_gpu_stream_vars = ""
+        gpu_stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(",")[0]
+        for csdfg, name, desc in self._global_sdfg.arrays_recursive(include_nested_data=True):
+            if name == gpu_stream_array_name and desc.lifetime == dtypes.AllocationLifetime.Persistent:
+                init_gpu_stream_vars = f"__state->__{csdfg.cfg_id}_{name}"
+                break
+
+        # Collect the init code blocks of all SDFGs and convert them to C++
+        initcode = CodeIOStream()
+        for sd in self._global_sdfg.all_sdfgs_recursive():
+            if None in sd.init_code:
+                initcode.write(codeblock_to_cpp(sd.init_code[None]), sd)
+            if 'cuda' in sd.init_code:
+                initcode.write(codeblock_to_cpp(sd.init_code['cuda']), sd)
+        initcode.write(self._initcode.getvalue())
+
+        exitcode = CodeIOStream()
+        for sd in self._global_sdfg.all_sdfgs_recursive():
+            if None in sd.exit_code:
+                exitcode.write(codeblock_to_cpp(sd.exit_code[None]), sd)
+            if 'cuda' in sd.exit_code:
+                exitcode.write(codeblock_to_cpp(sd.exit_code['cuda']), sd)
+        exitcode.write(self._exitcode.getvalue())
+
+        if self.backend == 'cuda':
+            backend_header = 'cuda_runtime.h'
+        elif self.backend == 'hip':
+            backend_header = 'hip/hip_runtime.h'
+        else:
+            raise NameError('GPU backend "%s" not recognized' % self.backend)
+
+        params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg))
+        if params_comma:
+            params_comma = ', ' + params_comma
+
+        pool_header = ''
+        if self.has_pool:
+            poolcfg = Config.get('compiler', 'cuda', 'mempool_release_threshold')
+            pool_header = f'''
+    cudaMemPool_t mempool;
+    cudaDeviceGetDefaultMemPool(&mempool, 0);
+    uint64_t threshold = {poolcfg if poolcfg != -1 else 'UINT64_MAX'};
+    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
+'''
+
+        self._codeobject.code = """
+#include <{backend_header}>
+#include <dace/dace.h>
+
+{file_header}
+
+DACE_EXPORTED int __dace_init_experimental_cuda({sdfg_state_name} *__state{params});
+DACE_EXPORTED int __dace_exit_experimental_cuda({sdfg_state_name} *__state);
+
+{other_globalcode}
+
+int __dace_init_experimental_cuda({sdfg_state_name} *__state{params}) {{
+    int count;
+
+    // Check that we are able to run {backend} code
+    if ({backend}GetDeviceCount(&count) != {backend}Success)
+    {{
+        printf("ERROR: GPU drivers are not configured or {backend}-capable device "
+               "not found\\n");
+        return 1;
+    }}
+    if (count == 0)
+    {{
+        printf("ERROR: No {backend}-capable devices found\\n");
+        return 2;
+    }}
+
+    // Initialize {backend} before we run the application
+    float *dev_X;
+    DACE_GPU_CHECK({backend}Malloc((void **) &dev_X, 1));
+    DACE_GPU_CHECK({backend}Free(dev_X));
+
+    {pool_header}
+
+    __state->gpu_context = new dace::cuda::Context({nstreams}, {nevents});
+
+    // Create {backend} streams and events
+    for(int i = 0; i < {nstreams}; ++i) {{
+        DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->internal_streams[i], {backend}StreamNonBlocking));
+        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams
+    }}
+    for(int i = 0; i < {nevents}; ++i) {{
+        DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming));
+    }}
+
+    {initcode}
+
+    return 0;
+}}
+
+int __dace_exit_experimental_cuda({sdfg_state_name} *__state) {{
+    {exitcode}
+
+    // Synchronize and check for CUDA errors
+    int __err = static_cast<int>(__state->gpu_context->lasterror);
+    if (__err == 0)
+    
__err = static_cast({backend}DeviceSynchronize()); + + // Destroy {backend} streams and events + for(int i = 0; i < {nstreams}; ++i) {{ + DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->internal_streams[i])); + }} + for(int i = 0; i < {nevents}; ++i) {{ + DACE_GPU_CHECK({backend}EventDestroy(__state->gpu_context->events[i])); + }} + + delete __state->gpu_context; + return __err; +}} + + +{localcode} +""".format(params=params_comma, + sdfg_state_name=mangle_dace_state_struct_name(self._global_sdfg), + initcode=initcode.getvalue(), + exitcode=exitcode.getvalue(), + other_globalcode=self._globalcode.getvalue(), + localcode=self._localcode.getvalue(), + file_header=fileheader.getvalue(), + nstreams=self._gpu_stream_manager.num_gpu_streams, + nevents=self._gpu_stream_manager.num_gpu_events, + backend=self.backend, + backend_header=backend_header, + pool_header=pool_header, + sdfg=self._global_sdfg) + + return [self._codeobject] + + ####################################################################### + # Compilation Related + + @staticmethod + def cmake_options(): + options = [] + + # Override CUDA toolkit + if Config.get('compiler', 'cuda', 'path'): + options.append("-DCUDA_TOOLKIT_ROOT_DIR=\"{}\"".format( + Config.get('compiler', 'cuda', 'path').replace('\\', '/'))) + + # Get CUDA architectures from configuration + backend = common.get_gpu_backend() + if backend == 'cuda': + cuda_arch = Config.get('compiler', 'cuda', 'cuda_arch').split(',') + cuda_arch = [ca for ca in cuda_arch if ca is not None and len(ca) > 0] + + cuda_arch = ';'.join(cuda_arch) + options.append(f'-DDACE_CUDA_ARCHITECTURES_DEFAULT="{cuda_arch}"') + + flags = Config.get("compiler", "cuda", "args") + options.append("-DCMAKE_CUDA_FLAGS=\"{}\"".format(flags)) + + if backend == 'hip': + hip_arch = Config.get('compiler', 'cuda', 'hip_arch').split(',') + hip_arch = [ha for ha in hip_arch if ha is not None and len(ha) > 0] + + flags = Config.get("compiler", "cuda", "hip_args") + flags += " -G -g" + flags += ' ' + ' '.join( + '--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) + for arch in hip_arch) + options.append("-DEXTRA_HIP_FLAGS=\"{}\"".format(flags)) + + if Config.get('compiler', 'cpu', 'executable'): + host_compiler = make_absolute(Config.get("compiler", "cpu", "executable")) + options.append("-DCUDA_HOST_COMPILER=\"{}\"".format(host_compiler)) + + return options + + ####################################################################### + # Callback to CPU codegen + + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + self._cpu_codegen.define_out_memlet(sdfg, cfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, + callsite_stream) + + def process_out_memlets(self, *args, **kwargs): + # Call CPU implementation with this code generator as callback + self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) + + +######################################################################### +# helper class +# This one is closely linked to the ExperimentalCUDACodeGen. In fact, +# it only exists to not have to much attributes and methods in the ExperimentalCUDACodeGen +# and to group Kernel specific methods & information. 
Thus, KernelSpec should remain in this file +class KernelSpec: + """ + A helper class to encapsulate information required for working with kernels. + This class provides a structured way to store and retrieve kernel parameters. + """ + + def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: ControlFlowRegion, + dfg_scope: ScopeSubgraphView, state_id: int): + + # Get kernel entry/exit nodes and current state + kernel_map_entry: nodes.MapEntry = dfg_scope.source_nodes()[0] + kernel_parent_state: SDFGState = cfg.state(state_id) + + self._kernel_map_entry: nodes.MapEntry = kernel_map_entry + self._kernels_state: SDFGState = kernel_parent_state + + # Kernel name + self._kernel_name: str = f'{kernel_map_entry.map.label}_{cfg.cfg_id}_{kernel_parent_state.block_id}_{kernel_parent_state.node_id(kernel_map_entry)}' + + # Get and store kernel constants — needed for applying 'const' and updating defined + # constant variable types in the dispatcher (handled at GPU codegen) + kernel_const_data = sdutil.get_constant_data(kernel_map_entry, kernel_parent_state) + kernel_const_symbols = sdutil.get_constant_symbols(kernel_map_entry, kernel_parent_state) + kernel_constants = kernel_const_data | kernel_const_symbols + self._kernel_constants: Set[str] = kernel_constants + + arglist: Dict[str, dt.Data] = cudaCodeGen._kernel_arglists[kernel_map_entry] + self._arglist = arglist + + # save _in_device_code value for restoring later + restore_in_device_code = cudaCodeGen._in_device_code + + # Certain args are called in the CUDA/HIP file or kernel funcion, in which the pointer name of the args are different + cudaCodeGen._in_device_code = True + self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] + + # Special: Persistent arguments + args_typed = [] + for name, data in arglist.items(): + if data.lifetime == dtypes.AllocationLifetime.Persistent: + arg_name = ptr(name, data, sdfg, cudaCodeGen._frame) + else: + arg_name = name + args_typed.append(('const ' if name in kernel_constants else '') + data.as_arg(name=arg_name)) + + self._args_typed = args_typed + + # Args for the kernel wrapper function + cudaCodeGen._in_device_code = False + + # Gather GPU stream information: + # - Use the connector name when passing the stream to the kernel + # - Use the configured variable name (from Config) in the wrapper’s function signature + # (this same name is also used when invoking {backend}LaunchKernel inside the wrapper) + gpustream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_input = [ + e for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry) + if e.src.desc(sdfg).dtype == dtypes.gpuStream_t + ] + if len(gpustream_input) > 1: + raise ValueError( + f"There can not be more than one GPU stream assigned to a kernel, but {len(gpustream_input)} were assigned." 
+ ) + + # Final wrapper arguments: + # - State struct (__state) + # - Original kernel args + # - GPU stream + self._kernel_wrapper_args_as_input = ( + ['__state'] + [ptr(name, data, sdfg, cudaCodeGen._frame) + for name, data in arglist.items()] + [str(gpustream_input[0].dst_conn)]) + + self._kernel_wrapper_args_typed = ([f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + + args_typed + [f"gpuStream_t {gpustream_var_name}"]) + + cudaCodeGen._in_device_code = restore_in_device_code + + # The kernel's grid and block dimensions + self._grid_dims, self._block_dims = cudaCodeGen._kernel_dimensions_map[kernel_map_entry] + + # C type of block, thread, and warp indices (as a string) + self._gpu_index_ctype: str = self.get_gpu_index_ctype() + + # Warp size (backend-dependent) + if cudaCodeGen.backend not in ['cuda', 'hip']: + raise ValueError(f"Unsupported backend '{cudaCodeGen.backend}' in ExperimentalCUDACodeGen. " + "Only 'cuda' and 'hip' are supported.") + + warp_size_key = 'cuda_warp_size' if cudaCodeGen.backend == 'cuda' else 'hip_warp_size' + self._warpSize = Config.get('compiler', 'cuda', warp_size_key) + + def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: + """ + Retrieves the GPU index data type as a C type string (for thread, block, warp indices) + from the configuration and if it matches a DaCe data type. + + Raises: + ValueError: If the configured type does not match a DaCe data type. + + Returns: + str: + The C type string corresponding to the configured GPU index type. + Used for defining thread, block, and warp indices in the generated code. + """ + type_name = Config.get('compiler', 'cuda', config_key) + dtype = getattr(dtypes, type_name, None) + if not isinstance(dtype, dtypes.typeclass): + raise ValueError( + f'Invalid {config_key} "{type_name}" configured (used for thread, block, and warp indices): ' + 'no matching DaCe data type found.\n' + 'Please use a valid type from dace.dtypes (e.g., "int32", "uint64").') + return dtype.ctype + + @property + def kernel_constants(self) -> Set[str]: + """Returns the kernel's constant data and symbols.""" + return self._kernel_constants + + @property + def kernel_name(self) -> list[str]: + """Returns the kernel (function's) name.""" + return self._kernel_name + + @property + def kernel_map_entry(self) -> nodes.MapEntry: + """ + Returns the entry node of the kernel, which is a MapEntry node + scheduled with dace.dtypes.ScheduleType.GPU_Device. + """ + return self._kernel_map_entry + + @property + def kernel_map(self) -> nodes.Map: + """Returns the kernel's map node.""" + return self._kernel_map_entry.map + + @property + def arglist(self) -> Dict[str, dt.Data]: + """ + Returns a dictionary of arguments for the kernel's subgraph, + mapping each data name to its corresponding data descriptor. + """ + return self._arglist + + @property + def args_as_input(self) -> list[str]: + """ + Returns the kernel function arguments formatted for use as inputs + when calling/launching the kernel function. + """ + return self._args_as_input + + @property + def args_typed(self) -> list[str]: + """ + Returns the typed kernel function arguments suitable for declaring + the kernel function. Each argument includes its corresponding data type. + """ + return self._args_typed + + @property + def kernel_wrapper_args_as_input(self) -> list[str]: + """ + Returns the argument names passed to the kernel wrapper function. 
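+
+        Illustrative example (all names are hypothetical; the stream connector name comes
+        from the GPU stream pipeline): for a kernel reading an array ``A`` and a scalar
+        ``alpha``, this property could yield roughly ``['__state', 'A', 'alpha', 'gpu_stream']``,
+        while ``kernel_wrapper_args_typed`` would yield the matching typed list ending in
+        ``gpuStream_t <configured stream variable name>``.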
+ + The kernel wrapper is a function defined in the CUDA/HIP code that is called + from the CPU code and is responsible for launching the kernel function. + """ + return self._kernel_wrapper_args_as_input + + @property + def kernel_wrapper_args_typed(self) -> list[str]: + """ + Returns the typed arguments used to declare the kernel wrapper function. + + The kernel wrapper is defined in the CUDA/HIP code, called from the CPU side, + and is responsible for launching the actual kernel function. + """ + return self._kernel_wrapper_args_typed + + @property + def grid_dims(self) -> list: + """Returns the grid dimensions of the kernel.""" + return self._grid_dims + + @property + def block_dims(self) -> list: + """Returns the block dimensions of the kernel.""" + return self._block_dims + + @property + def warpSize(self) -> int: + """ + Returns the warp size used in this kernel. + This value depends on the selected backend (CUDA or HIP) + and is retrieved from the configuration. + """ + return self._warpSize + + @property + def gpu_index_ctype(self) -> str: + """ + Returns the C data type used for GPU indices (thread, block, warp) + in generated code. This type is determined by the 'gpu_index_type' + setting in the configuration and matches with a DaCe typeclass. + """ + return self._gpu_index_ctype diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py new file mode 100644 index 0000000000..3982f3a86d --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -0,0 +1,756 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Union + +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace import symbolic +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp, unparse_cr +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call +from dace.config import Config +from dace.dtypes import StorageType +from dace.frontend import operations +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. 
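+
+    A minimal usage sketch (illustrative; in practice the code generator constructs the
+    context and selects one of the strategies defined below):
+
+        ctx = CopyContext(sdfg, state, src_node, dst_node, edge, gpustream_assignments)
+        strategy = OutOfKernelCopyStrategy()
+        if strategy.applicable(ctx):
+            code = strategy.generate_copy(ctx)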
+ """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.gpustream_assignments = gpustream_assignments + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. + + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. + """ + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) + + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") + + if src_stream != dst_stream: + raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same.") + + # 2. Generate GPU stream expression + gpustream = src_stream + gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. 
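+
+        Illustrative example: for a copy from a GPU_Global array to a CPU_Heap array this
+        returns ('Device', 'Host'), which the copy code then combines into the transfer
+        kind string, e.g. ``cudaMemcpyDeviceToHost`` for the 'cuda' backend.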
+ """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. + """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be 
AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + # Build final expressions + src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) + dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) + + return copy_shape, src_strides, dst_strides, src_expr, dst_expr + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. 
Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False + + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." + copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. 
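+
+        Illustrative output (assuming the 'cuda' backend, a contiguous host-to-device copy
+        of 128 floats and a stream variable named ``gpu_stream0``; all names are hypothetical):
+
+            DACE_GPU_CHECK(cudaMemcpyAsync(dst_ptr, src_ptr, 128 * sizeof(float),
+                                           cudaMemcpyHostToDevice, gpu_stream0));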
+ """ + # ----------- Retrieve relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call -------------------- + + if is_contiguous_copy: + # Memory is linear: can use {backend}MemcpyAsync + copysize = ' * '.join(sym2cpp(copy_shape)) + copysize += f' * sizeof({ctype})' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n' + + else: + # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch + # This allows copying a strided 1D region + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Potentially snychronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() + return call + + def _generate_2d_copy(self, copy_context: CopyContext) -> None: + """ + Generates a 2D memory copy using {backend}Memcpy2DAsync. + + Three main cases are handled: + - Copy between row-major stored arrays with contiguous rows. + - Copy between column-major stored arrays with contiguous columns. + - A special case where a 2D copy can still be represented. + + Raises: + NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. + Such cases indicate an unsupported 2D copy and should be examined separately. + They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. + + Note: + {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), + but not both simultaneously. + """ + + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call if supported -------------------- + + # Case: Row-major layout, rows are not strided. + if (src_strides[1] == 1) and (dst_strides[1] == 1): + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[0])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Case: Column-major layout, no columns are strided. 
+ elif (src_strides[0] == 1) and (dst_strides[0] == 1): + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[1])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Special case + elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]): + # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with + # copy shape [I, J], src_strides[J*K, K], dst_strides[J, 1]. This can be represented with a + # {backend}Memcpy2DAsync call! + + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0] * copy_shape[1]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + else: + raise NotImplementedError( + f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." + "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." + ) + + return call + + def _generate_nd_copy(self, copy_context: CopyContext) -> None: + """ + Generates GPU code for copying N-dimensional arrays using 2D memory copies. + + Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops + for any outer dimensions. Expects the copy to be contiguous and between + row-major storage locations. 
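+
+        Illustrative output for a device-to-device copy of shape [4, 32, 32] between
+        contiguous row-major float arrays, i.e. strides [1024, 32, 1] on both sides
+        (backend 'cuda'; pointer and stream names are hypothetical):
+
+            for (int __copyidx0 = 0; __copyidx0 < 4; ++__copyidx0) {
+            DACE_GPU_CHECK(cudaMemcpy2DAsync(dst_ptr + (__copyidx0 * (1024)), 32 * sizeof(float),
+                                             src_ptr + (__copyidx0 * (1024)), 32 * sizeof(float),
+                                             32 * sizeof(float), 32,
+                                             cudaMemcpyDeviceToDevice, gpu_stream0));
+            }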
+        """
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+        num_dims = len(copy_shape)
+
+        # ----------- Guard for unsupported patterns --------------
+        if not (src_strides[-1] == 1 and dst_strides[-1] == 1):
+            src_node, dst_node = copy_context.src_node, copy_context.dst_node
+            src_storage = copy_context.get_storage_type(src_node)
+            dst_storage = copy_context.get_storage_type(dst_node)
+            raise NotImplementedError(
+                "N-dimensional GPU memory copies that are strided or involve column-major arrays are currently not supported.\n"
+                f"  Source node: {src_node} (storage: {src_storage})\n"
+                f"  Destination node: {copy_context.dst_node} (storage: {dst_storage})\n"
+                f"  Source strides: {src_strides}\n"
+                f"  Destination strides: {dst_strides}\n"
+                f"  Copy shape: {copy_shape}\n")
+
+        # ----------------- Generate and write backend call(s) --------------------
+
+        call = ""
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n"
+
+        # Write Memcpy2DAsync
+        offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})'
+        spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})'
+        width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})'
+        height = sym2cpp(copy_shape[-2])
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate call and write it
+        call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Write for-loop footers
+        for dim in range(num_dims - 2):
+            call += "\n}"
+
+        # Return the code
+        return call
+
+
+class SyncCollaboritveGPUCopyStrategy(CopyStrategy):
+    """
+    Implements (synchronous) collaborative GPU copy operations.
+
+    This strategy generates the appropriate code for copies performed
+    inside GPU kernels, where multiple threads cooperate to move data
+    between GPU memory spaces (e.g., global to shared memory).
+    """
+
+    def applicable(self, copy_context: CopyContext) -> bool:
+        """
+        Checks if the copy is eligible for a collaborative GPU-to-GPU copy.
+
+        Conditions:
+        1. The copy is between two AccessNodes.
+        2. The copy is between GPU memory StorageTypes (shared or global).
+        3. The innermost non-sequential map is a GPU_Device-scheduled map, i.e.,
+           the copy occurs within a kernel but is not within a GPU_ThreadBlock map.
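+
+        Illustrative example: a copy from a GPU_Global AccessNode into a GPU_Shared
+        AccessNode placed directly inside a GPU_Device map qualifies; the same copy
+        nested inside a GPU_ThreadBlock map does not, because the innermost
+        non-sequential map is then the thread-block map.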
+ """ + # --- Condition 1: src and dst are AccessNodes --- + src_node, dst_node = copy_context.src_node, copy_context.dst_node + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # --- Condition 2: GPU to GPU memory transfer --- + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) + gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} + + if not (src_storage in gpu_storages and dst_storage in gpu_storages): + return False + + # --- Condition 3: Next non-sequential Map is a GPU_Device Map --- + next_nonseq_parent_map = self._next_non_seq_parent_map(copy_context) + if next_nonseq_parent_map is None: + return False + else: + return next_nonseq_parent_map.map.schedule == dtypes.ScheduleType.GPU_Device + + def generate_copy(self, copy_context: CopyContext, kernel_dimensions_maps: Dict[nodes.MapEntry, + Tuple[List, List]]) -> str: + """ + Generates a GPU copy call as a string using DaCe's runtime CUDA copy functions. + + The function determines the appropriate templated copy function from + `dace/libraries/runtime/include/dace/cuda/copy.cuh` and constructs + the call string with the necessary arguments, including kernel block + dimensions and optional accumulation/reduction information. + + Parameters + ---------- + copy_context : CopyContext + Helper object containing information about the copy. + + kernel_dimensions_maps : Dict[nodes.MapEntry, Tuple[List, List]] + Kernel map (GPU_Devie scheduled map) entry nodes to (grid_dims, block_dims); + block_dims needed in templating. + + Returns + ------- + str + The GPU copy call in C++ as a string. + + Notes + ----- + - The kernel block size could be derived, but since this function is typically called + from `ExperimentalCUDACodeGen`, it is provided as input to avoid recomputation. + - The template functions use a parameter called 'is_async', which is set to True here + because `ExperimentalCUDACodeGen` inserts "__syncthreads()" explicitly in tasklets. 
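+        - Illustrative shape of the constructed call (names and sizes are hypothetical):
+          for a 2D global-to-shared copy of a ``double`` tile with block dims (64, 8, 1),
+          copy size (32, 32), source strides (1024, 1) and destination strides (32, 1),
+          the emitted string looks roughly like
+          ``dace::GlobalToShared2D<double, 64, 8, 1, 32, 32, 32, 1, true>(gA + off, 1024, 1, sA);``.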
+ """ + # ----------- Retrieve relevant copy information -------------- + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + ctype = dtype.ctype + + # Get copy function name (defined in runtime library) + num_dims = len(copy_shape) + src_node, dst_node = copy_context.src_node, copy_context.dst_node + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) + src_storage_name = self._get_storagename(src_storage) + dst_storage_name = self._get_storagename(dst_storage) + function_name = f"dace::{src_storage_name}To{dst_storage_name}{num_dims}D" + + # Extract WCR info (accumulation template + optional custom reduction) + accum, custom_reduction = self._get_accumulation_info(copy_context) + custom_reduction = [custom_reduction] if custom_reduction else [] + + # Get parent kernel block dimensions (guaranteed GPU_Device) and sync flag + parent_kernel = self._next_non_seq_parent_map(copy_context) + block_dims = ", ".join(sym2cpp(kernel_dimensions_maps[parent_kernel][1])) + synchronized = "true" # Legacy 'is_async'; sync barriers handled by passes (see docstring) + + # ------------------------- Generate copy call ---------------------------- + + if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction + dst_strides + copy_shape) + args = ", ".join(sym2cpp(args_list)) + call = f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});" + + elif function_name == "dace::SharedToGlobal1D": + copy_size = ', '.join(sym2cpp(copy_shape)) + accum = accum or '::Copy' + args_list = ([src_expr] + src_strides + [dst_expr] + dst_strides + custom_reduction) + args = ", ".join(sym2cpp(args_list)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});" + + else: + copy_size = ', '.join(sym2cpp(copy_shape)) + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction) + args = ", ".join(sym2cpp(args_list)) + dst_strides_unpacked = ", ".join(sym2cpp(dst_strides)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});" + + return call + + def _get_accumulation_info(self, copy_context: CopyContext) -> Tuple[str, str]: + """ + Extracts write-conflict resolution (WCR) information from the copy context + and returns the accumulation/reduction template components needed for the + final templated function call in `generate_copy()`. + + This method processes WCR information from the memlet and generates the + appropriate C++ template strings for both predefined and custom reductions. + + Parameters + ---------- + copy_context : CopyContext + Copy context containing the copy operation details, including + the memlet with WCR information. + + Returns + ------- + Tuple[str, str] + A tuple containing: + - accum : str + Template accumulation string for the function call. Empty string if no WCR, + `"::template Accum"` for predefined reductions, or `"::template Accum"` for custom reductions. + - custom_reduction : str + C++ formatted custom reduction code string. Empty string for no WCR or predefined reductions, + unparsed custom reduction code for custom reductions. 
+ """ + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + memlet = copy_context.edge.data + wcr = memlet.wcr + reduction_type = operations.detect_reduction_type(wcr) + + if wcr is None: + accum, custom_reduction = "", "" + + elif reduction_type != dtypes.ReductionType.Custom: + # Use predefined reduction + reduction_type_str = str(reduction_type).split(".")[-1] # e.g., "Sum" + accum = f"::template Accum" + custom_reduction = "" + + else: + accum = "::template Accum" + custom_reduction = unparse_cr(sdfg, wcr, dtype) + + return accum, custom_reduction + + def _get_storagename(self, storage: dtypes.StorageType): + """ + Returns a string containing the name of the storage location. + + Example: dtypes.StorageType.GPU_Shared will return "Shared". + """ + storage_name = str(storage) + return storage_name[storage_name.rindex('_') + 1:] + + def _next_non_seq_parent_map(self, copy_context: CopyContext) -> Optional[nodes.MapEntry]: + """ + Traverse up the parent map chain from the deeper of src_node or dst_node + in `copy_context` and return the first parent MapEntry whose schedule + is not sequential. + + Parameters + ---------- + copy_context : CopyContext + Context information about the memory copy. + + Returns + ------- + Optional[nodes.MapEntry] + The first non-sequential parent MapEntry encountered, or None if no + such parent exists. + """ + src_node, dst_node = copy_context.src_node, copy_context.dst_node + state = copy_context.state + scope_dict = state.scope_dict() + + # Determine which node (src or dst) is in the deeper scope + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + current_node = deeper_node + while (current_node is None or not isinstance(current_node, nodes.MapEntry) + or current_node.map.schedule == dtypes.ScheduleType.Sequential): + parent = helpers.get_parent_map(state, current_node) + if parent is None: + current_node = None + break + current_node, state = parent + + return current_node diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py new file mode 100644 index 0000000000..329547331a --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py @@ -0,0 +1,80 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, Union +from dace import SDFG, nodes + + +class GPUStreamManager: + """ + Manage GPU backend streams (e.g., CUDA or HIP) for nodes in an SDFG. + + Nodes are assigned stream IDs by the NaiveGPUStreamScheduler Pass, and + this class provides their access expressions and tracks the number of streams + in use. GPU events are not (yet) supported. + + Note + ---- + "Stream" refers to backend GPU streams, not DaCe data streams. + """ + + def __init__(self, sdfg: SDFG, gpustream_assignments: Dict[nodes.Node, int]): + self.sdfg = sdfg + self._stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" + self._gpustream_assignments = gpustream_assignments + self._num_gpu_streams = max(gpustream_assignments.values()) + 1 if gpustream_assignments else 0 + self._num_gpu_events = 0 + + def get_stream_node(self, node: nodes.Node) -> str: + """ + Return the access expression for the GPU stream assigned to a node. + + Parameters + ---------- + node : nodes.Node + The node for which to return the access expression of its assigned CUDA stream. 
+ + Returns + ------- + str + The GPU stream access expression, e.g., + "__state->gpu_context->streams[0]". + + Raises + ------ + ValueError + If the given node does not have an assigned stream. + """ + if node in self.gpustream_assignments: + return self._stream_access_template.format(gpu_stream=self.gpustream_assignments[node]) + else: + raise ValueError(f"No GPU stream assigned to node {node}. " + "Check whether the node is relevant for GPU stream assignment and, if it is, " + "inspect the GPU stream pipeline to see why no stream was assigned.") + + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: + """ + Returns the GPU stream access expression for an edge. + + Currently unused: edge-level streams were only needed for asynchronous + memory-copy operations (e.g., cudaMemcpyAsync). These copies are now + modeled via tasklets in the SDFG, so edges do not carry stream info. + Implement this if the design changes and edges need streams again. + """ + raise NotImplementedError("Edge-level GPU streams are not supported. " + "They were previously used for asynchronous memory copies (e.g., cudaMemcpyAsync), " + "but these are now modeled via tasklets in the SDFG. " + "Implement this if the design changes and edges must carry GPU stream information.") + + @property + def num_gpu_events(self) -> int: + """Number of GPU events (currently always 0, left here for potential future support).""" + return 0 + + @property + def num_gpu_streams(self) -> int: + """Number of GPU streams in use (stream IDs start at 0).""" + return self._num_gpu_streams + + @property + def gpustream_assignments(self) -> Dict[nodes.Node, int]: + """Mapping of nodes to assigned GPU stream IDs (not all nodes necessarily have a GPU stream ID).""" + return self._gpustream_assignments diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py new file mode 100644 index 0000000000..27c073afc8 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -0,0 +1,163 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import functools + +import sympy +from typing import Set, List + +import dace +from dace import Config, data as dt, dtypes +from dace.sdfg import nodes, SDFGState +from dace.codegen import common +from dace.codegen.dispatcher import DefinedType +from dace.transformation.helpers import get_parent_map + + +def get_cuda_dim(idx): + """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ + if idx < 0 or idx > 2: + raise ValueError(f'idx must be between 0 and 2, got {idx}') + return ('x', 'y', 'z')[idx] + + +def product(iterable): + """ + Computes the symbolic product of elements in the iterable using sympy.Mul. + + This is equivalent to: ```functools.reduce(sympy.Mul, iterable, 1)```. + + Purpose: This function is used to improve readability of the codeGen. + """ + return functools.reduce(sympy.Mul, iterable, 1) + + +def to_3d_dims(dim_sizes: List) -> List: + """ + Converts a list of dimension sizes to a 3D format. + + If the list has more than three dimensions, all dimensions beyond the second are + collapsed into the third (via multiplication). If the list has fewer than three + entries, it is padded with 1s to ensure a fixed length of three. 
+ + Examples: + [x] → [x, 1, 1] + [x, y] → [x, y, 1] + [x, y, z] → [x, y, z] + [x, y, z, u, v] → [x, y, z * u * v] + """ + + if len(dim_sizes) > 3: + # multiply everything from the 3rd onward into d[2] + dim_sizes[2] = product(dim_sizes[2:]) + dim_sizes = dim_sizes[:3] + + # pad with 1s if necessary + dim_sizes += [1] * (3 - len(dim_sizes)) + + return dim_sizes + + +def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: List): + """ + Validates that the given block size for a kernel does not exceed typical CUDA hardware limits. + + These limits are not enforced by the CUDA compiler itself, but are configurable checks + performed by DaCe during GPU code generation. They are based on common hardware + restrictions and can be adjusted via the configuration system. + + Specifically, this function checks: + - That the total number of threads in the block does not exceed `compiler.cuda.block_size_limit`. + - That the number of threads in the last (z) dimension does not exceed + `compiler.cuda.block_size_lastdim_limit`. + + Raises: + ValueError: If either limit is exceeded. + """ + kernel_map_label = kernel_map_entry.map.label + + total_block_size = product(block_size) + limit = Config.get('compiler', 'cuda', 'block_size_limit') + lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') + + if (total_block_size > limit) == True: + raise ValueError(f'Block size for kernel "{kernel_map_label}" ({block_size}) ' + f'is larger than the possible number of threads per block ({limit}). ' + 'The kernel will potentially not run, please reduce the thread-block size. ' + 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' + 'configuration entry.') + + if (block_size[-1] > lastdim_limit) == True: + raise ValueError(f'Last block size dimension for kernel "{kernel_map_label}" ({block_size}) ' + 'is larger than the possible number of threads in the last block dimension ' + f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' + 'thread-block size. To increase this limit, modify the ' + '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + + +def generate_sync_debug_call() -> str: + """ + Generate backend sync and error-check calls as a string if + synchronous debugging is enabled. + + Parameters + ---------- + backend : str + Backend API prefix (e.g., 'cuda'). + + Returns + ------- + str + The generated debug call code, or an empty string if debugging is disabled. + """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call + + +def get_defined_type(data: dt.Data) -> DefinedType: + """ + Return the DefinedType for a data descriptor. + Currently supports only scalars and arrays; extend if others are needed. + """ + if isinstance(data, dt.Scalar): + return DefinedType.Scalar + elif isinstance(data, dt.Array): + return DefinedType.Pointer + else: + raise NotImplementedError(f"Data type '{type(data).__name__}' is not supported for defined type inference." + "Only Scalars and Arrays are expected for Kernels.") + + +def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: + """ + Checks if the given node is enclosed within a Map whose schedule type + matches any in the `schedules` set. 
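+
+    Example (illustrative): with ``schedules={dtypes.ScheduleType.GPU_Device}``, a tasklet
+    nested inside a GPU_ThreadBlock map that is itself inside a GPU_Device map returns True,
+    while a node outside any GPU map returns False.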
+ + Parameters + ---------- + state : SDFGState + The State where the node resides + node : nodes.Node + The node to check. + schedules : set[dtypes.ScheduleType] + A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). + + Returns + ---------- + bool + True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise. + """ + current = node + + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule in schedules: + return True + + parent = get_parent_map(state, current) + if parent is None: + return False + current, state = parent diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py new file mode 100644 index 0000000000..800b6ab4c8 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -0,0 +1,550 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod + +import dace +from dace import dtypes, subsets, symbolic +from dace.sdfg import SDFG, ScopeSubgraphView, nodes, SDFGState +from dace.sdfg.state import ControlFlowRegion +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.dispatcher import DefinedType, TargetDispatcher +from dace.transformation import helpers +from dace.codegen.targets.cpp import sym2cpp +from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, KernelSpec +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import (get_cuda_dim, product) + +#---------------------------------------------------------------------------------- +# GPU Scope Generation Strategies +#---------------------------------------------------------------------------------- + + +class ScopeGenerationStrategy(ABC): + """Base strategy for generating GPU scope code""" + + def __init__(self, codegen: ExperimentalCUDACodeGen): + self.codegen: ExperimentalCUDACodeGen = codegen + self._dispatcher: TargetDispatcher = codegen._dispatcher + self._current_kernel_spec: KernelSpec = codegen._current_kernel_spec + + @abstractmethod + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + raise NotImplementedError('Abstract class') + + +class KernelScopeGenerator(ScopeGenerationStrategy): + + def __init__(self, codegen: ExperimentalCUDACodeGen): + super().__init__(codegen) + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + + node = dfg_scope.source_nodes()[0] + schedule_type = node.map.schedule + + # This strategy starts kernel code generation and is only valid if + # the outermost (first) GPU schedule is of type GPU_Device. 
+ applicable = schedule_type == dtypes.ScheduleType.GPU_Device + return applicable + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + # Generate kernel function signature + self._generate_kernel_signature(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Generate kernel body + with ScopeManager(frame_codegen=self.codegen._frame, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id, + function_stream=function_stream, + callsite_stream=callsite_stream, + comment="Kernel scope") as scope_manager: + + # ----------------- Retrieve kernel configuration ----------------------- + + kernel_spec = self._current_kernel_spec + kernel_entry_node = kernel_spec._kernel_map_entry # == dfg_scope.source_nodes()[0] + kernel_map = kernel_spec.kernel_map + + # ----------------- Kernel/Map Range Preprocessing ----------------------- + + reversed_kernel_range = kernel_map.range[::-1] # also reverse it + kernel_range = subsets.Range(reversed_kernel_range) + kernel_dimensions = len(kernel_range) + kernel_dim_sizes = kernel_range.size() + + # ----------------- Set up symbolic index expressions ----------------------- + + symbolic_indices = [ + symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions) + ] + symbolic_coordinates = kernel_range.coord_at(symbolic_indices) + + # ----------------- Generate Thread or Block index Definitions ----------------------- + + thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices + + # In case there is no ThreadBlock map used in a submap, the map variables will + # be mapped to thread IDs instead of block IDs + for dim in range(kernel_dimensions): + + var_name = kernel_map.params[-dim - 1] # also reverse it here! 
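+
+                # Worked example (illustrative, not generated code): for a 4D kernel whose
+                # reversed dimension sizes are [S0, S1, S2, S3], the expressions below give
+                #   dim 0 -> blockIdx.x
+                #   dim 1 -> blockIdx.y
+                #   dim 2 -> (blockIdx.z / (S3))
+                #   dim 3 -> ((blockIdx.z / (1)) % (S3))
+                # i.e. all dimensions from the third onward share blockIdx.z and are recovered
+                # by division/modulo with the tail product of the remaining sizes; the result
+                # is then shifted/scaled through the map range via the symbolic coordinates.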
+
+                # Compute index expressions for up to 3 dimensions (x, y, z)
+                if dim < 3:
+                    index_expr = f'blockIdx.{get_cuda_dim(dim)}'
+                    # Delinearize third dimension if more than 3D (used in 3D+ mapping)
+                    if dim == 2 and kernel_dimensions > 3:
+                        tail_prod = product(kernel_dim_sizes[3:])
+                        index_expr = f"({index_expr} / ({sym2cpp(tail_prod)}))"
+
+                else:  # Handle dimensions beyond the third (delinearize and modulo)
+                    index_expr = 'blockIdx.z'
+                    tail_prod = product(kernel_dim_sizes[dim + 1:])
+                    index_expr = (f"(({index_expr} / ({sym2cpp(tail_prod)})) % ({sym2cpp(kernel_dim_sizes[dim])}))")
+
+                # Define the thread/block index variable
+                var_def = sym2cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr)
+                callsite_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node)
+                self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype)
+
+            self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream, callsite_stream)
+
+            # ----------------- Dispatch Subgraph code generation -----------------------
+
+            self._dispatcher.dispatch_subgraph(sdfg,
+                                               cfg,
+                                               dfg_scope,
+                                               state_id,
+                                               function_stream,
+                                               callsite_stream,
+                                               skip_entry_node=True)
+
+            self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream,
+                                                           callsite_stream)
+
+    def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView,
+                                   state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream):
+
+        kernel_name = self._current_kernel_spec.kernel_name
+        kernel_args = self._current_kernel_spec.args_typed
+        block_dims = self._current_kernel_spec.block_dims
+        node = dfg_scope.source_nodes()[0]
+
+        # Conditionally add __launch_bounds__ for block size optimization.
+        launch_bounds = ''
+        if node.gpu_launch_bounds != '-1':
+            if node.gpu_launch_bounds == "0":
+                if not any(symbolic.issymbolic(b) for b in block_dims):
+                    launch_bounds = f'__launch_bounds__({product(block_dims)})'
+            else:
+                launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})'
+
+        # Emit kernel function signature
+        callsite_stream.write(f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', cfg,
+                              state_id, node)
+
+
+class ThreadBlockScopeGenerator(ScopeGenerationStrategy):
+
+    def __init__(self, codegen: ExperimentalCUDACodeGen):
+        super().__init__(codegen)
+
+    def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int,
+                   function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool:
+
+        node = dfg_scope.source_nodes()[0]
+        applicable = node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock
+
+        return applicable
+
+    def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int,
+                 function_stream: CodeIOStream, callsite_stream: CodeIOStream):
+
+        with ScopeManager(frame_codegen=self.codegen._frame,
+                          sdfg=sdfg,
+                          cfg=cfg,
+                          dfg_scope=dfg_scope,
+                          state_id=state_id,
+                          function_stream=function_stream,
+                          callsite_stream=callsite_stream,
+                          comment="ThreadBlock Scope") as scope_manager:
+
+            node = dfg_scope.source_nodes()[0]
+            scope_map = node.map
+
+            # ----------------- Map Range Preprocessing -----------------------
+
+            # Reverse range for better performance (e.g.
memory coalescing) + reversed_scope_range = scope_map.range[::-1] + map_range = subsets.Range(reversed_scope_range) + map_dimensions = len(map_range) + map_dim_sizes = map_range.size() + + kernel_block_dims = self._current_kernel_spec.block_dims + + # ----------------- Symbolic Index Expressions ----------------------- + + symbolic_indices = [ + symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(map_dimensions) + ] + symbolic_index_bounds = [ + idx + (block_dim * rng[2]) - 1 + for idx, block_dim, rng in zip(symbolic_indices, kernel_block_dims, map_range) + ] + symbolic_coordinates = map_range.coord_at(symbolic_indices) + + # ----------------- Generate Index Variable Definitions ----------------------- + + # Get the block's index dace data type + block_id_ctype = self._current_kernel_spec.gpu_index_ctype + + for dim in range(map_dimensions): + var_name = scope_map.params[-dim - 1] # also reverse it here! + + if dim < 3: + # First three dimensions: direct mapping or partial delinearization + if dim == 2 and map_dimensions > 3: + tail_prod = product(map_dim_sizes[3:]) + base_expr = f"(threadIdx.z / ({sym2cpp(tail_prod)}))" + else: + base_expr = f"threadIdx.{get_cuda_dim(dim)}" + else: + # Dimensions beyond the third: full delinearization + tail_prod = product(map_dim_sizes[dim + 1:]) + base_expr = (f"((threadIdx.z / ({sym2cpp(tail_prod)})) % ({sym2cpp(map_dim_sizes[dim])}))") + + var_def = sym2cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) + callsite_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) + + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + # ----------------- Guard Conditions for Block Execution ----------------------- + + # Generate conditions for this block's execution using min and max + # element, e.g. 
skipping out-of-bounds threads in trailing block + minels = map_range.min_element() + maxels = map_range.max_element() + for dim, (var_name, start, end) in enumerate(zip(scope_map.params[::-1], minels, maxels)): + + # Optimize conditions if they are always true + ############################################# + + condition = '' + + # Block range start + if dim >= 3 or (symbolic_indices[dim] >= start) != True: + condition += f'{var_name} >= {sym2cpp(start)}' + + # Special case: block size is exactly the range of the map (0:b) + if dim >= 3: + skipcond = False + else: + skipcond = symbolic_index_bounds[dim].subs({symbolic_indices[dim]: start}) == end + + # Block range end + if dim >= 3 or (not skipcond and (symbolic_index_bounds[dim] < end) != True): + if len(condition) > 0: + condition += ' && ' + condition += f'{var_name} < {sym2cpp(end + 1)}' + + # Emit condition in code if any + if len(condition) > 0: + scope_manager.open(condition=condition) + + # ----------------- Dispatch Subgraph code generation ----------------------- + + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + +class WarpScopeGenerator(ScopeGenerationStrategy): + + def __init__(self, codegen: ExperimentalCUDACodeGen): + super().__init__(codegen) + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + + node = dfg_scope.source_nodes()[0] + applicable = node.map.schedule == dtypes.ScheduleType.GPU_Warp + + return applicable + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + with ScopeManager(frame_codegen=self.codegen._frame, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id, + function_stream=function_stream, + callsite_stream=callsite_stream, + comment="WarpLevel Scope") as scope_manager: + + # Get kernel specifications + kernel_spec = self._current_kernel_spec + block_dims = kernel_spec.block_dims + warpSize = kernel_spec.warpSize + + state_dfg = cfg.state(state_id) + node = dfg_scope.source_nodes()[0] + scope_map = node.map + + map_range = subsets.Range(scope_map.range[::-1]) # Reversed for potential better performance + warp_dim = len(map_range) + + # The following sizes and bounds are be symbolic + num_threads_in_block = product(block_dims) + warp_dim_bounds = [max_elem + 1 for max_elem in map_range.max_element()] + num_warps = product(warp_dim_bounds) + + # The C type used to define the (flat) threadId and warpId variables + ids_ctype = kernel_spec.gpu_index_ctype + + # ----------------- Guard checks ----------------------- + + # handles checks either at compile time or runtime (i.e. 
checks in the generated code) + self._handle_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, + callsite_stream, scope_manager) + + # ----------------- Define (flat) Thread ID within Block ----------------------- + + flattened_terms = [] + + for i, dim_size in enumerate(block_dims): + + if dim_size == 1: + continue + + dim = get_cuda_dim(i) + stride = [f"{block_dims[j]}" for j in range(i) if block_dims[j] > 1] + idx_expr = " * ".join(stride + [f"threadIdx.{get_cuda_dim(i)}"]) if stride else f"threadIdx.{dim}" + flattened_terms.append(idx_expr) + + joined_terms = " + ".join(flattened_terms) + flat_thread_idx_expr = f"({joined_terms})" if len(flattened_terms) > 1 else joined_terms + + threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, + state_dfg.node_id(node)) + + callsite_stream.write(f"{ids_ctype} {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, + state_id, node) + self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, ids_ctype) + + # ----------------- Compute Map indices (= Warp indices) ----------------------- + + for i in range(warp_dim): + var_name = scope_map.params[-i - 1] # reverse order + previous_sizes = warp_dim_bounds[:i] + + if len(previous_sizes) > 0: + divisor = product(previous_sizes) + expr = f"(({threadID_name} / {divisor}) % ({warp_dim_bounds[i]}))" + else: + expr = f"({threadID_name} % ({warp_dim_bounds[i]}))" + + callsite_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) + + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + # ----------------- Guard Conditions for Warp Execution ----------------------- + + if num_warps * warpSize != num_threads_in_block: + condition = f'{threadID_name} < {num_warps}' + scope_manager.open(condition) + + warp_range = [(start, end + 1, stride) for start, end, stride in map_range.ranges] + + for dim, (var_name, (start, _, stride)) in enumerate(zip(scope_map.params[::-1], warp_range)): + + condition_terms = [] + + if start != 0: + condition_terms.append(f"{var_name} >= {start}") + + if stride != 1: + expr = var_name if start == 0 else f"({var_name} - {start})" + condition_terms.append(f'{expr} % {stride} == 0') + + if condition_terms: + condition = " && ".join(condition_terms) + scope_manager.open(condition) + + # ----------------- Dispatch Subgraph code generation ----------------------- + + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + + def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, + warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, + scope_manager: 'ScopeManager'): + + #TODO: Move them to sdfg validation as well if possible + + # Get warpSize from the kernel specification + warpSize = self._current_kernel_spec.warpSize + + parent_map, _ = helpers.get_parent_map(state_dfg, node) + if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: + raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") + + if warp_dim > 3: + raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") + + # Guard against invalid thread/block configurations. 
+        # - For concrete (compile-time) values, raise Python errors early.
+        # - For symbolic values, insert runtime CUDA checks (guards) into the generated kernel.
+        #   These will emit meaningful error messages and abort execution if violated.
+        if isinstance(num_threads_in_block, symbolic.symbol):
+            condition = (f"{num_threads_in_block} % {warpSize} != 0 || "
+                         f"{num_threads_in_block} > 1024 || "
+                         f"{num_warps} * {warpSize} > {num_threads_in_block}")
+            kernel_stream.write(f"""\
+                if ({condition}) {{
+                    printf("CUDA error:\\n"
+                           "1. Block must be a multiple of {warpSize} threads (DaCe requirement for GPU_Warp scheduling).\\n"
+                           "2. Block size must not exceed 1024 threads (CUDA hardware limit).\\n"
+                           "3. Number of warps x {warpSize} must fit in the block (otherwise logic is unclear).\\n");
+                    asm("trap;");
+                }}
+                """)
+
+        else:
+            if isinstance(num_warps, symbolic.symbol):
+                condition = f"{num_warps} * {warpSize} > {num_threads_in_block}"
+                scope_manager.open(condition=condition)
+
+            elif num_warps * warpSize > num_threads_in_block:
+                raise ValueError(f"Invalid configuration: {num_warps} warps x {warpSize} threads exceed "
+                                 f"{num_threads_in_block} threads in the block.")
+
+            if num_threads_in_block % warpSize != 0:
+                raise ValueError(f"Block must be a multiple of {warpSize} threads for GPU_Warp scheduling "
+                                 f"(got {num_threads_in_block}).")
+
+            if num_threads_in_block > 1024:
+                raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).")
+
+        for min_element in map_range.min_element():
+            if isinstance(min_element, symbolic.symbol):
+                kernel_stream.write(
+                    f'if ({min_element} < 0) {{\n'
+                    f'    printf("Runtime error: Warp ID symbol {min_element} must be non-negative.\\n");\n'
+                    f'    asm("trap;");\n'
+                    f'}}\n')
+            elif min_element < 0:
+                raise ValueError(f"Warp ID value {min_element} must be non-negative.")
+
+
+#----------------------------------------------------------------------------------
+# Scope Manager, handling brackets and allocation/deallocation of arrays in Scopes
+#----------------------------------------------------------------------------------
+
+
+class ScopeManager:
+    """
+    A helper class that manages opening and closing brackets in a structured way via the 'with' statement,
+    ensuring that every opened scope is closed correctly. It also supports an optional debug mode that adds
+    comments to the generated code, which can help with debugging and understanding the code structure.
+    """
+
+    def __init__(self,
+                 frame_codegen: DaCeCodeGenerator,
+                 sdfg: SDFG,
+                 cfg: ControlFlowRegion,
+                 dfg_scope: ScopeSubgraphView,
+                 state_id: int,
+                 function_stream: CodeIOStream,
+                 callsite_stream: CodeIOStream,
+                 comment: str = None,
+                 brackets_on_enter: bool = True,
+                 debug: bool = False):
+        """
+        Initializes the ScopeManager.
+
+        :param frame_codegen: The frame code generator, used for allocating and deallocating arrays in scopes.
+        :param sdfg: The SDFG instance for context.
+        :param cfg: The ControlFlowRegion instance for context.
+        :param dfg_scope: The ScopeSubgraphView instance for context.
+        :param state_id: The ID of the current state for context.
+        :param function_stream: The CodeIOStream for function-level code.
+        :param callsite_stream: The CodeIOStream for callsite-level code.
+        :param comment: A descriptive comment explaining the purpose of the code block being opened. Default is None.
+        :param brackets_on_enter: Whether a bracket should be opened upon entering the 'with' block. Default is True.
+ :param debug: Whether to include debug comments in the output. Defaults to False. + """ + self.frame_codegen = frame_codegen + self.sdfg = sdfg + self.cfg = cfg + self.dfg_scope = dfg_scope + self.state_id = state_id + self.function_stream = function_stream + self.callsite_stream = callsite_stream + self.comment = comment + self.brackets_on_enter = brackets_on_enter + self.debug = debug + self._opened = 0 + + self.entry_node = self.dfg_scope.source_nodes()[0] + self.exit_node = self.dfg_scope.sink_nodes()[0] + + def __enter__(self): + """ + Writes the opening bracket in case self.brackets_on_enter + is set to true, which it is by default. + """ + if self.brackets_on_enter: + self.open() + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Writes the closing brackets to the stream. + """ + for i in range(self._opened): + line = "}" + if self.debug: + line += f" // {self.comment} (close {i + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.exit_node) + + def open(self, condition: str = None): + """ + Opens a bracket. If a condition is given, emits 'if (condition) {', otherwise just '{'. + Tracks the number of open brackets for closing later. + + :param condition: Optional condition for the opening bracket. + """ + line = f"if ({condition}) {{" if condition else "{" + if self.debug: + line += f" // {self.comment} (open {self._opened + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.entry_node) + self._opened += 1 diff --git a/dace/codegen/targets/gpu_helpers/copy_strategies.py b/dace/codegen/targets/gpu_helpers/copy_strategies.py new file mode 100644 index 0000000000..27a5b2c53b --- /dev/null +++ b/dace/codegen/targets/gpu_helpers/copy_strategies.py @@ -0,0 +1,553 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Union + +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace import symbolic +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp +from dace.codegen.targets.gpu_helpers.gpu_utils import generate_sync_debug_call +from dace.config import Config +from dace.dtypes import StorageType +from dace.frontend import operations +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. 
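+
+    Example (sketch, assuming an out-of-kernel host-to-device copy between two AccessNodes and a
+    precomputed GPU stream assignment)::
+
+        ctx = CopyContext(sdfg, state, src_node, dst_node, edge, gpustream_assignments)
+        strategy = OutOfKernelCopyStrategy()
+        if strategy.applicable(ctx):
+            code = strategy.generate_copy(ctx)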
+ """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.gpustream_assignments = gpustream_assignments + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. + + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. + """ + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) + + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") + + if src_stream != dst_stream: + raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same.") + + # 2. Generate GPU stream expression + gpustream = src_stream + gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. 
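+
+        Example (illustrative): for a copy from a CPU_Heap array to a GPU_Global array,
+        this returns ('Host', 'Device').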
+ """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. + """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be 
AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + # Build final expressions + src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) + dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) + + return copy_shape, src_strides, dst_strides, src_expr, dst_expr + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. 
Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False + + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." + copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. 
+        """
+
+        # ----------- Retrieve relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1)
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+
+        # ----------------- Generate backend call --------------------
+
+        if is_contiguous_copy:
+            # Memory is linear: can use {backend}MemcpyAsync
+            copysize = ' * '.join(sym2cpp(copy_shape))
+            copysize += f' * sizeof({ctype})'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+            call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n'
+
+        else:
+            # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch,
+            # which allows copying a strided 1D region
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_2d_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates a 2D memory copy using {backend}Memcpy2DAsync.
+
+        Three main cases are handled:
+        - Copy between row-major stored arrays with contiguous rows.
+        - Copy between column-major stored arrays with contiguous columns.
+        - A special case where a 2D copy can still be represented.
+
+        Raises:
+            NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns.
+                Such cases indicate an unsupported 2D copy and should be examined separately.
+                They can be implemented if valid, or a more descriptive error should be raised if the path should not occur.
+
+        Note:
+            {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column),
+            but not both simultaneously.
+        """
+
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+
+        # ----------------- Generate backend call if supported --------------------
+
+        # Case: Row-major layout, rows are not strided.
+        if (src_strides[1] == 1) and (dst_strides[1] == 1):
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[0])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Case: Column-major layout, columns are not strided.
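+        # (e.g., both arrays stored in Fortran order, where the stride of the first dimension is 1;
+        #  illustrative, not an exhaustive characterization)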
+        elif (src_strides[0] == 1) and (dst_strides[0] == 1):
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[1])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Special case
+        elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]):
+            # As an example, consider this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with
+            # copy shape [I, J], src_strides [J*K, K] and dst_strides [J, 1]. This can still be
+            # represented with a single {backend}Memcpy2DAsync call.
+
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0] * copy_shape[1])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        else:
+            raise NotImplementedError(
+                f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}. "
+                "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken."
+            )
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_nd_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates GPU code for copying N-dimensional arrays using 2D memory copies.
+
+        Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops
+        for any outer dimensions. Expects the copy to be contiguous and between
+        row-major storage locations.
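+
+        Sketch (illustrative): for a 3D copy of shape [I, J, K], this emits one loop over the
+        outermost dimension and a single {backend}Memcpy2DAsync call per iteration, covering the
+        innermost two dimensions.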
+        """
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+        num_dims = len(copy_shape)
+
+        # ----------- Guard for unsupported patterns --------------
+        if not (src_strides[-1] == 1 and dst_strides[-1] == 1):
+            src_node, dst_node = copy_context.src_node, copy_context.dst_node
+            src_storage = copy_context.get_storage_type(src_node)
+            dst_storage = copy_context.get_storage_type(dst_node)
+            raise NotImplementedError(
+                "N-dimensional GPU memory copies that are strided or involve column-major arrays are currently not supported.\n"
+                f"  Source node: {src_node} (storage: {src_storage})\n"
+                f"  Destination node: {copy_context.dst_node} (storage: {dst_storage})\n"
+                f"  Source strides: {src_strides}\n"
+                f"  Destination strides: {dst_strides}\n"
+                f"  Copy shape: {copy_shape}\n")
+
+        # ----------------- Generate and write backend call(s) --------------------
+
+        call = ""
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n"
+
+        # Write the Memcpy2DAsync call covering the innermost two dimensions
+        offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})'
+        spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})'
+        width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})'
+        height = sym2cpp(copy_shape[-2])
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate the call and append it
+        call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call += generate_sync_debug_call()
+
+        # Write for-loop footers
+        for dim in range(num_dims - 2):
+            call += "\n}"
+
+        # Return the code
+        return call
diff --git a/dace/codegen/targets/gpu_helpers/gpu_utils.py b/dace/codegen/targets/gpu_helpers/gpu_utils.py
new file mode 100644
index 0000000000..e4c4c1fc38
--- /dev/null
+++ b/dace/codegen/targets/gpu_helpers/gpu_utils.py
@@ -0,0 +1,27 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+from dace import Config
+from dace.codegen import common
+
+
+def generate_sync_debug_call() -> str:
+    """
+    Generate backend synchronization and error-check calls as a string if
+    synchronous debugging (``syncdebug``) is enabled. The backend prefix
+    (e.g., 'cuda') is determined automatically.
+
+    Returns
+    -------
+    str
+        The generated debug call code, or an empty string if debugging is disabled.
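+
+    Example (for the 'cuda' backend, when syncdebug is enabled)::
+
+        DACE_GPU_CHECK(cudaGetLastError());
+        DACE_GPU_CHECK(cudaDeviceSynchronize());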
+ """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 72e1f784f9..71cf433eb0 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -326,7 +326,7 @@ required: Additional CUDA architectures (separated by commas) to compile GPU code for, excluding the current architecture on the compiling machine. - default: '60' + default: '86' hip_arch: type: str @@ -403,9 +403,9 @@ required: type: bool title: Synchronous Debugging description: > - Enables Synchronous Debugging mode, where each library call - is followed by full-device synchronization and error checking. - default: false + Enables debugging mode where each asynchronous GPU call is followed by + device-wide synchronization and error checking. + default: False libs: type: str @@ -454,17 +454,81 @@ required: index types are needed to address memory offsets that are beyond the 32-bit range, or to reduce memory usage. - allow_implicit_memlet_to_map: - type: bool - title: Allow the implicit conversion of Memlets to Maps during code generation. - default: true + # New configs, needed for ExperimentalCUDACodeGen + implementation: + type: str + title: CUDA codegen implementation + description: > + Choose between available CUDA code generation implementations. + "legacy" is stable, "experimental" is used by Berkay Aydogdu and + Yakup Koray Budanaz for Berkays master-thesis. + enum: [legacy, experimental] + default: experimental + + gpu_index_type: + type: str + title: Thread/block/warp index data type + default: int32 + description: > + Defines the data type for a thread, block and warp index in the generated code. + The type is based on the type-classes in ``dace.dtypes``. For example, + ``uint64`` is equivalent to ``dace.uint64``. Change this setting when large + index types are needed to address memory offsets that are beyond the 32-bit + range, or to reduce memory usage. This replaces ``thread_id_type`` in + ``ExperimentalCUDACodeGen`` , as the new name more accurately reflects its broader + usage. + + cuda_warp_size: + type: int + title: CUDA warp size + description: > + Defines the warp size used during CUDA code generation. The default and current + standard value for CUDA is 32. This should only be changed if future CUDA + architectures explicitly alter the warp size. Modifying this value arbitrarily may + result in incorrect or unknown behavior, and is therefore strongly discouraged. + default: 32 + + hip_warp_size: + type: int + title: HIP warp size description: > - If ``true`` the code generator will implicitly convert Memlets that cannot be - represented by a native library call, such as ``cudaMemcpy()`` into Maps that - explicitly copy the data around. If this value is ``false`` the code generator - will raise an exception if such a Memlet is encountered. This allows the user - to have full control over all Maps in the SDFG. + Specifies the warp size (also known as wavefront size) for HIP code generation. + The default value for AMD GPUs is typically 64. This setting should only be modified + if you have a clear understanding of what you are doing. 
+ default: 64 + auto_syncthreads_insertion: + type: bool + title: Insert Default __syncthreads() Tasklets + description: > + If enabled, inserts default __syncthreads() tasklets during preprocessing + in ExperimentalCUDACodeGen to ensure shared memory is ready before access. + This is a simple safeguard for correctness—it may not be complete, but it + does the job for basic SDFGs. Disable if you handle synchronization manually + or use other mechanisms like async copies or pipelines. + default: True + + current_thread_block_name: + type: str + title: Variable name for the current thread block + description: > + Specifies the name of the variable that holds the current thread block group, + initialized using `cooperative_groups::this_thread_block()`. This is useful in + contexts like custom tasklets, where the variable is explicitly referenced + (e.g., `cooperative_groups::wait(block)`). Setting this allows users to customize the + variable name without modifying the source code or relying on a fixed name. + default: block + + gpu_stream_name: + type: str + title: Name for the GPU stream object + description: > + GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously + and in parallel. This field specifies the naming convention for the hpu stream array and its connectors + in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the + stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a + connector for gpu_streams[0]. + default: gpu_streams,gpu_stream ############################################# # General FPGA flags diff --git a/dace/dtypes.py b/dace/dtypes.py index faadc84a50..b11c8b3bb1 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -77,6 +77,7 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping + GPU_Warp = () # A subset of GPU schedule types @@ -87,6 +88,19 @@ class ScheduleType(aenum.AutoNumberEnum): ScheduleType.GPU_Persistent, ] +# A subset of GPU schedule types for ExperimentalCUDACodeGen +GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN = [ + ScheduleType.GPU_Device, + ScheduleType.GPU_ThreadBlock, + ScheduleType.GPU_Warp, +] + +# A subset of on-GPU storage types for ExperimentalCUDACodeGen +GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN = [ + StorageType.GPU_Global, + StorageType.GPU_Shared, +] + # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, @@ -204,7 +218,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + ScheduleType.GPU_Warp: StorageType.Register, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -225,7 +240,8 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, + ScheduleType.GPU_Warp: ScheduleType.Sequential, } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. 
@@ -1266,6 +1282,7 @@ def isconstant(var): complex128 = typeclass(numpy.complex128) string = stringtype() MPI_Request = opaque('MPI_Request') +gpuStream_t = opaque('gpuStream_t') @undefined_safe_enum @@ -1286,6 +1303,7 @@ class Typeclasses(aenum.AutoNumberEnum): float64 = float64 complex64 = complex64 complex128 = complex128 + gpuStream_t = gpuStream_t _bool = bool diff --git a/dace/registry.py b/dace/registry.py index 08efeb65ed..bab0fa4ade 100644 --- a/dace/registry.py +++ b/dace/registry.py @@ -37,6 +37,23 @@ def autoregister(cls: Type, **kwargs): that automatically registers the subclass with the superclass registry upon creation. """ + # Ensures that the correct CUDA implementation is selected and the other is not registered. + # Registering both leads to errors. + from dace.config import Config + + name = kwargs.get('name') + impl = Config.get('compiler', 'cuda', 'implementation') + + valid_impls = {'legacy', 'experimental'} + if impl not in valid_impls: + raise ValueError(f"Invalid CUDA implementation: {impl}. " + f"Please select one of {valid_impls} under compiler.cuda.implementation in the configs.") + + # Only the CUDA implementation selected in Config is registered + if name in {'cuda', 'experimental_cuda'}: + if (impl == 'experimental' and name == 'cuda') or (impl == 'legacy' and name == 'experimental_cuda'): + return + registered = False for base in cls.__bases__: if hasattr(base, '_registry_') and hasattr(base, 'register'): diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 31ab055b48..3e7a1e450d 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -285,6 +285,16 @@ class AccessNode(Node): instrument_condition = CodeProperty(desc="Condition under which to trigger the instrumentation", default=CodeBlock("1", language=dtypes.Language.CPP)) + # Experimental-CUDA-specific properties + async_copy = Property(dtype=bool, + desc="Marks the data copy to this node (if any) as asynchronous (CUDA-specific).", + default=False) + + async_pipeline = Property(dtype=str, + desc="Name of the CUDA pipeline responsible for synchronization. " + "Only relevant if async_copy is True. 
May be None.", + allow_none=True) + def __init__(self, data, debuginfo=None): super(AccessNode, self).__init__() @@ -312,6 +322,9 @@ def __deepcopy__(self, memo): node._guid = graph.generate_element_id(node) + node._async_copy = self._async_copy + node._async_pipeline = self._async_pipeline + return node @property @@ -933,6 +946,9 @@ def used_symbols_within_scope(self, parent_state: 'dace.SDFGState', all_symbols: free_symbols |= e.data.used_symbols(all_symbols, e) + # Update with the symbols needed by the map + free_symbols |= self.free_symbols + # Do not consider SDFG constants as symbols new_symbols.update(set(parent_sdfg.constants.keys())) return free_symbols - new_symbols diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index bda9d8707e..d37ef6dae1 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -505,6 +505,8 @@ class SDFG(ControlFlowRegion): default=False, desc="Whether the SDFG contains explicit control flow constructs") + metadata = Property(dtype=dict, desc="Metada attached to the SDFG", default=None, allow_none=True) + def __init__(self, name: str, constants: Dict[str, Tuple[dt.Data, Any]] = None, @@ -597,6 +599,9 @@ def __deepcopy__(self, memo): if fixed: warnings.warn(f'Fixed {fixed} nested SDFG parent references during deep copy.') + # copy metadata + result._metadata = copy.deepcopy(self._metadata, memo) + return result @property diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index d558053d3d..2f656111f2 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -405,6 +405,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()): return result + # For the (new) gpu stream handling we can have dynamic out connectors, e.g. 
+ # KernelExit: stream -> None: AccessNode, where AccessNode accesses a Stream array + # Memlets are used but its not about seing how data flows + if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device + and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t): + return result + # Prepend incoming edges until reaching the source node curedge = edge visited = set() diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 2cb66bc765..0d07b2f3e5 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -356,6 +356,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context "Arrays that use a multibank access pattern must have the size of the first dimension equal" f" the number of banks and have at least 2 dimensions for array {name}", sdfg, None) + # Check for interstate edges that write to scalars or arrays + _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg) + # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) @@ -379,6 +382,17 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context raise +def _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg: 'dace.sdfg.SDFG'): + from dace.sdfg import InterstateEdge + for edge, graph in sdfg.all_edges_recursive(): + if edge.data is not None and isinstance(edge.data, InterstateEdge): + # sdfg.arrays return arrays and scalars, it is invalid to write to them + if any([key in graph.sdfg.arrays for key in edge.data.assignments]): + raise InvalidSDFGInterstateEdgeError( + f'Assignment to a scalar or an array detected in an interstate edge: "{edge}"', graph.sdfg, + graph.edge_id(edge)) + + def _accessible(sdfg: 'dace.sdfg.SDFG', container: str, context: Dict[str, bool]): """ Helper function that returns False if a data container cannot be accessed in the current SDFG context. @@ -906,9 +920,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', for oe in state.out_edges(dst_node)}): pass else: - raise InvalidSDFGEdgeError( - f"Memlet creates an invalid path (sink node {dst_node}" - " should be a data node)", sdfg, state_id, eid) + if isinstance(dst_node, nd.Tasklet) and len(dst_node.in_connectors) == 0 and len( + dst_node.out_connectors) == 0: + # Tasklets with no input or output connector -> sync tasklet -> OK + pass + else: + raise InvalidSDFGEdgeError( + f"Memlet creates an invalid path (sink node {dst_node}" + " should be a data node)", sdfg, state_id, eid) # If scope(dst) is disjoint from scope(src), it's an illegal memlet else: raise InvalidSDFGEdgeError("Illegal memlet between disjoint scopes", sdfg, state_id, eid) diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 4875279bea..cbccc6da37 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1552,6 +1552,38 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio return None +def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: + """ + Checks if the given node is enclosed within a Map whose schedule type + matches any in the `schedules` set. + + Parameters + ---------- + state : SDFGState + The State where the node resides + node : nodes.Node + The node to check. + schedules : set[dtypes.ScheduleType] + A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). 
+ + Returns + ---------- + bool + True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise. + """ + current = node + + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule in schedules: + return True + + parent = get_parent_map(state, current) + if parent is None: + return False + current, state = parent + + def redirect_edge(state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_src: Optional[nodes.Node] = None, diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index de1dfcf645..fe0ed80e41 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -619,7 +619,70 @@ def _create_copy_out(arrays_used: Set[str]) -> Dict[str, str]: block.replace_meta_accesses({devicename: hostname}) # Step 9: Simplify - if not self.simplify: + if self.simplify: + sdfg.simplify() + + ######################################################################## + # In case the ExperimentalCUDACodeGen is selected, we handle, for backwards + # compatibility, the use of in-kernel, transient GPU_Global stored array here. + from dace.config import Config + if not Config.get('compiler', 'cuda', 'implementation') == 'experimental': return - sdfg.simplify() + # import needed modules + from dace.transformation import helpers + from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel + import warnings + + # Detect transient GPU_Global arrays inside GPU_Device-scheduled maps + transients_in_kernels: Set[Tuple[str, data.Array, nodes.MapEntry]] = set() + transient_outside_kernels: Set[Tuple[str, data.Array]] = set() + + for node, parent in sdfg.all_nodes_recursive(): + # ---------- Consider only transient GPU_Global arrays ------- + if not isinstance(node, nodes.AccessNode): + continue + + desc = node.desc(parent) + if not isinstance(desc, data.Array): + continue + if not desc.transient: + continue + if desc.storage != dtypes.StorageType.GPU_Global: + continue + + #------- Check whether transient/access node occurs within a kernel -------- + in_kernel = False + parent_map_info = helpers.get_parent_map(state=parent, node=node) + while parent_map_info is not None: + map_entry, map_state = parent_map_info + if (isinstance(map_entry, nodes.MapEntry) and map_entry.map.schedule == dtypes.ScheduleType.GPU_Device): + in_kernel = True + break + parent_map_info = helpers.get_parent_map(map_state, map_entry) + + if in_kernel: + transients_in_kernels.add((node.data, desc, map_entry)) + else: + transient_outside_kernels.add((node.data, desc)) + + # Skip transients that are used outside of GPU kernels, unless a separate, strictly kernel-local + # transient with the same name exists inside a kernel. In such cases, 'MoveArrayOutOfKernel' is + # still applied to the local one, and naming conflicts are handled automatically. + transient_defined_inside_kernel: Set[Tuple[str, nodes.MapEntry]] = set() + for data_name, array_desc, kernel_entry in transients_in_kernels: + if (data_name, array_desc) in transient_outside_kernels: + continue + else: + transient_defined_inside_kernel.add((data_name, kernel_entry)) + + # Apply the pass and warn the user of its use + for data_name, kernel_entry in transient_defined_inside_kernel: + warnings.warn( + f"Transient array '{data_name}' with storage type GPU_Global detected inside kernel {kernel_entry}. 
" + "GPU_Global memory cannot be allocated within GPU kernels, so this usage is semantically invalid. " + "As a best-effort fix, the array will be lifted outside the kernel as a non-transient GPU_Global array. " + "Any naming conflicts are resolved automatically. " + "Please avoid this pattern, as it is strongly discouraged and may lead to undefined behavior. " + "Note that this fix provides no guarantees, especially for unusual or complex use cases.") + MoveArrayOutOfKernel().apply_pass(sdfg, kernel_entry, data_name) diff --git a/dace/transformation/passes/analysis/infer_const_args.py b/dace/transformation/passes/analysis/infer_const_args.py new file mode 100644 index 0000000000..0f66d49732 --- /dev/null +++ b/dace/transformation/passes/analysis/infer_const_args.py @@ -0,0 +1,36 @@ +import dace +from dace.transformation import pass_pipeline as ppl, transformation +from typing import Dict, Set, Tuple +from dace import properties +import dace.sdfg.utils as sdutils + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InferConstantArguments(ppl.Pass): + """ + Evaluates which symbols and data are const within a scope. + """ + + CATEGORY: str = 'Analysis' + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return modified & ppl.Modifies.CFG & ppl.Modifies.SDFG & ppl.Modifies.Nodes + + def depends_on(self): + return {} + + def apply_pass(self, sdfg: dace.SDFG, pipeline_res: Dict) -> Dict[str, Tuple[Set[str], Set[str]]]: + const_args_dict = dict() + for node, parent_graph in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device: + const_args_dict[node.guid] = (sdutils.get_constant_data(node, parent_state=parent_graph), + sdutils.get_constant_symbols(node, parent_state=parent_graph)) + elif isinstance(node, dace.sdfg.nodes.NestedSDFG): + const_args_dict[node.guid] = (sdutils.get_constant_data(node.sdfg, parent_state=parent_graph), + sdutils.get_constant_symbols(node.sdfg, parent_state=parent_graph)) + + return const_args_dict diff --git a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py new file mode 100644 index 0000000000..0421d02049 --- /dev/null +++ b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py @@ -0,0 +1,170 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import warnings +from typing import Dict, List, Set, Tuple + +import sympy + +from dace import SDFG, SDFGState, dtypes, symbolic +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils +from dace.sdfg import nodes +from dace.transformation import helpers, pass_pipeline as ppl + + +class InferGPUGridAndBlockSize(ppl.Pass): + """ + Infers the 3D CUDA launch configuration (grid and block sizes) for all GPU_Device map entries in the SDFG. + + This pass assumes the `AddThreadBlockMap` transformation has already been applied, ensuring that each kernel + either has an explicit thread block map. However it is applicable as long as each GPU_Device scheduled map + has an inner explicit GPU_ThreadBlock scheduled map. + + Block sizes are determined based on: + - Whether an explicit GPU_ThreadBlock map was inserted by `AddThreadBlockMap`. In this case, + the `gpu_block_size` attribute holds this information. + - Existing nested thread block maps and also the `gpu_block_size`, if present. 
+ + Grid sizes are computed from the kernel map's range, normalized to a 3D shape. + + NOTE: + This pass does not handle dynamic parallelism (i.e., nested GPU_Device maps), + nor does it support GPU_ThreadBlock_Dynamic maps inside kernels. Behavior is unclear in + such cases. + """ + + def apply_pass(self, sdfg: SDFG, + kernels_with_added_tb_maps: Set[nodes.MapEntry]) -> Dict[nodes.MapEntry, Tuple[List, List]]: + """ + Analyzes the given SDFG to determine the 3D grid and block sizes for all GPU_Device map entries. + + Returns: + A dictionary mapping each GPU_Device MapEntry node to a tuple (grid_dimensions, block_dimensions). + """ + # Collect all GPU_Device map entries across the SDFG + kernel_maps: Set[Tuple[ + nodes.MapEntry, + SDFGState, + ]] = set() + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.MapEntry) and node.schedule == dtypes.ScheduleType.GPU_Device: + kernel_maps.add((node, state)) + + kernel_dimensions_map: Dict[nodes.MapEntry, Tuple[List, List]] = dict() + for map_entry, state in kernel_maps: + # Compute grid size + raw_grid = map_entry.map.range.size(True)[::-1] + grid_size = gpu_utils.to_3d_dims(raw_grid) + + # Compute Block size + if map_entry in kernels_with_added_tb_maps: + block_size = self._get_inserted_gpu_block_size(map_entry) + else: + block_size = self._infer_gpu_block_size(state, map_entry) + + block_size = gpu_utils.to_3d_dims(block_size) + gpu_utils.validate_block_size_limits(map_entry, block_size) + + kernel_dimensions_map[map_entry] = (grid_size, block_size) + + return kernel_dimensions_map + + def _get_inserted_gpu_block_size(self, kernel_map_entry: nodes.MapEntry) -> List: + """ + Returns the block size from a kernel map entry with an inserted thread-block map. + + Assumes the `gpu_block_size` attribute is set by the AddThreadBlockMap transformation. + """ + gpu_block_size = kernel_map_entry.map.gpu_block_size + + if gpu_block_size is None: + raise ValueError("Expected 'gpu_block_size' to be set. This kernel map entry should have been processed " + "by the AddThreadBlockMap transformation.") + + return gpu_block_size + + def _infer_gpu_block_size(self, state: SDFGState, kernel_map_entry: nodes.MapEntry) -> List: + """ + Infers the GPU block size for a kernel map entry based on nested GPU_ThreadBlock maps. + + If the `gpu_block_size` attribute is set, it is assumed to be user-defined (not set by + a transformation like `AddThreadBlockMap`), and all nested thread-block maps must fit within it. + Otherwise, the block size is inferred by overapproximating the range sizes of all inner + GPU_ThreadBlock maps of kernel_map_entry. + + + Example: + for i in dace.map[0:N:32] @ GPU_Device: + for j in dace.map[0:32] @ GPU_ThreadBlock: + ... + for l in dace.map[0:23] @ GPU_ThreadBlock: + for k in dace.map[0:16] @ GPU_ThreadBlock: + ... 
+ + Inferred GPU block size is [32, 1, 1] + """ + # Identify nested threadblock maps + threadblock_maps = self._get_internal_threadblock_maps(state, kernel_map_entry) + + # guard check + if not threadblock_maps: + raise ValueError(f"{self.__class__.__name__} expects at least one explicit nested GPU_ThreadBlock map, " + "as it assumes AddThreadBlockMap was applied beforehand.\n" + f"Check for issues in that transformation or ensure AddThreadBlockMap was applied.") + + # Overapproximated block size enclosing all inner ThreadBlock maps + block_size = kernel_map_entry.map.gpu_block_size + detected_block_sizes = [block_size] if block_size is not None else [] + for tb_map in threadblock_maps: + + # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32) + # and collapse to GPU-compatible 3D dimensions + tb_size = [symbolic.overapproximate(s) for s in tb_map.range.size()[::-1]] + tb_size = gpu_utils.to_3d_dims(tb_size) + + if block_size is None: + block_size = tb_size + else: + block_size = [sympy.Max(sz1, sz2) for sz1, sz2 in zip(block_size, tb_size)] + + if block_size != tb_size or len(detected_block_sizes) == 0: + detected_block_sizes.append(tb_size) + + # Check for conflicting or multiple thread-block sizes + # - If gpu_block_size is explicitly defined (by the user) and conflicts with detected map sizes, raise an error + # - Otherwise, emit a warning when multiple differing sizes are detected, and over-approximate + if len(detected_block_sizes) > 1: + kernel_map_label = kernel_map_entry.map.label + + if kernel_map_entry.map.gpu_block_size is not None: + raise ValueError('Both the `gpu_block_size` property and internal thread-block ' + 'maps were defined with conflicting sizes for kernel ' + f'"{kernel_map_label}" (sizes detected: {detected_block_sizes}). ' + 'Use `gpu_block_size` only if you do not need access to individual ' + 'thread-block threads, or explicit block-level synchronization (e.g., ' + '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' + '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' + 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + + else: + warnings.warn('Multiple thread-block maps with different sizes detected for ' + f'kernel "{kernel_map_label}": {detected_block_sizes}. ' + f'Over-approximating to block size {block_size}.\n' + 'If this was not the intent, try tiling one of the thread-block maps to match.') + + return block_size + + def _get_internal_threadblock_maps(self, state: SDFGState, + kernel_map_entry: nodes.MapEntry) -> List[nodes.MapEntry]: + """ + Returns GPU_ThreadBlock MapEntries nested within a given the GPU_Device scheduled kernel map + (kernel_map_entry). + + Returns: + A List of GPU_ThreadBlock scheduled maps. + """ + threadblock_maps = [] + + for _, scope in helpers.get_internal_scopes(state, kernel_map_entry): + if isinstance(scope, nodes.MapEntry) and scope.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + threadblock_maps.append(scope) + + return threadblock_maps diff --git a/dace/transformation/passes/fix_test.py b/dace/transformation/passes/fix_test.py new file mode 100644 index 0000000000..80caa2d563 --- /dev/null +++ b/dace/transformation/passes/fix_test.py @@ -0,0 +1,110 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
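+#
+# NOTE: The `Fix` pass below has an empty docstring. From the implementation, it appears to
+# look for AccessNodes of Register-storage arrays (excluding Views and Persistent-lifetime
+# data) that are enclosed in a GPU_Device-scheduled map and whose total element count can be
+# proven to exceed 64; such arrays are turned into transient GPU_Global arrays, and a mapping
+# from array name to the enclosing kernel MapEntry is returned. For example, a Register array
+# of shape (8, 16) has 128 elements and 128 > 64, so it would be moved to GPU_Global, while a
+# shape-(4, 4) array (16 elements) would be left untouched. Symbolically sized arrays whose
+# size cannot be decided are currently left in place.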
+from typing import Any, Dict, Set, Type, Union + +import numpy as np +import sympy as sp + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types + + +@properties.make_properties +@transformation.explicit_cf_compatible +class Fix(ppl.Pass): + """ + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Descriptors | ppl.Modifies.Nodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, dace.data.Data]: + + from dace.transformation.helpers import get_parent_map + + skip = set() + to_be_moved = set() + names: Dict = dict() + for node, parent_state in sdfg.all_nodes_recursive(): + if not isinstance(node, nodes.AccessNode): + continue + + map_parent = None + state = parent_state + current = node + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule == dace.dtypes.ScheduleType.GPU_Device: + map_parent = current + break + + parent = get_parent_map(state, current) + if parent is None: + break + current, state = parent + + if map_parent is None: + continue + + if node.data not in parent_state.sdfg.arrays: + continue + + data_desc = node.desc(parent_state) + if not data_desc.storage == dtypes.StorageType.Register: + continue + + if isinstance(data_desc, dace.data.View) or data_desc.lifetime == dtypes.AllocationLifetime.Persistent: + continue + + break_cond = False + for edge, parent in sdfg.all_edges_recursive(): + if not isinstance(parent, dace.SDFGState): + continue + src = edge.src + if edge.dst_conn == node.data and isinstance(src, nodes.AccessNode) and src.data != node.data: + break_cond = True + skip.add(src.data) + + if break_cond: + continue + + shape = data_desc.shape + size_expr = np.prod(shape) + + # Try to evaluate the inequality + cmp = sp.simplify(size_expr > 64) + + if cmp is sp.true: # definitely larger + move_out = True + elif cmp is sp.false: # definitely safe + move_out = False + else: + # TODO: explain yakup and myself + # undecidable case (symbolic expression) + move_out = False # or warn, depending on policy + + if move_out: + to_be_moved.add((node.data, data_desc, map_parent)) + + for name, desc, map_parent in to_be_moved: + if name in skip: + continue + + desc.storage = dtypes.StorageType.GPU_Global + desc.transient = True + names[name] = map_parent + + return names diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py new file mode 100644 index 0000000000..225dba00e4 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py @@ -0,0 +1,70 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
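+#
+# Illustrative sketch (assuming the stream array is named `gpu_streams` and the per-stream
+# variable prefix is `gpu_stream_`; both come from the `compiler.cuda.gpu_stream_name` config
+# entry): for a kernel assigned to GPU stream 1, this pass rewires the state as
+#
+#   gpu_streams --gpu_streams[1]--> MapEntry (GPU_Device) ... MapExit --gpu_streams[1]--> gpu_streams
+#
+# i.e., the MapEntry receives an input connector `gpu_stream_1` and the MapExit an output
+# connector `gpu_stream_1`, both of type `gpuStream_t`.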
+from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class ConnectGPUStreamsToKernels(ppl.Pass): + """ + This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps). + + Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, + indicating which GPU stream each kernel is assigned to. These assignments are e.g. + used when launching the kernels. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream array name and the prefix for individual stream variables + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Link kernels to their assigned GPU streams + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a kernel entry - continue + if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}" + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Assign the GPU stream to the kernel entry + kernel_entry = node + kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_in = state.add_access(stream_array_name) + state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, + dace.Memlet(accessed_gpu_stream)) + + # Assign the GPU stream to the kernel exit + kernel_exit = state.exit_node(kernel_entry) + kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_out = state.add_access(stream_array_name) + state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, + dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py new file mode 100644 index 0000000000..58d9ff70ff --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py @@ -0,0 +1,80 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
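+#
+# Illustrative sketch (assuming the stream array is named `gpu_streams`): a tasklet whose code
+# references `__dace_current_stream`, e.g. one produced by a GPU library-node expansion, and
+# which the scheduler assigned to stream 0, is rewired as
+#
+#   gpu_streams --gpu_streams[0]--> Tasklet --gpu_streams[0]--> gpu_streams
+#
+# with `__dace_current_stream` added as an input and output connector of type `gpuStream_t`,
+# so the placeholder resolves to an explicit connector instead of a variable defined
+# implicitly during unparsing in `cpp.py`.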
+from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class ConnectGPUStreamsToTasklets(ppl.Pass): + """ + This pass ensures that tasklets which require access to their assigned GPU stream + are provided with it explicitly. + + Such tasklets typically originate from expanded LibraryNodes targeting GPUs. + These nodes may reference the special placeholder variable `__dace_current_stream`, + which is expected to be defined during unparsing in `cpp.py`. + + To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use + the GPU stream AccessNode directly. + + Note that this pass is similar to `ConnectGPUStreamsToKernels`. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream's array name + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code + # and provide them the needed GPU stream explicitly + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a tasklet - continue + if not isinstance(node, nodes.Tasklet): + continue + + # Tasklet does not need use its assigned GPU stream - continue + if not STREAM_PLACEHOLDER in node.code.as_string: + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_conn = STREAM_PLACEHOLDER + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Provide the GPU stream explicitly to the tasklet + stream_array_in = state.add_access(stream_array_name) + stream_array_out = state.add_access(stream_array_name) + + node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t) + node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True) + + state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py new file mode 100644 index 0000000000..0ad3c2e7c0 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py @@ -0,0 +1,249 @@ 
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, List, Set, Type, Union + +import dace +from dace import SDFG, SDFGState, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg.graph import Graph, NodeT +from dace.transformation import pass_pipeline as ppl, transformation + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class NaiveGPUStreamScheduler(ppl.Pass): + """ + Assigns GPU streams to nodes and stores the assignments in a dictionary. + This can be useful for enabling asynchronous and parallel GPU computation using GPU streams. + + Strategy Overview: + ------------------ + - GPU stream assignment is based on weakly connected components (WCCs) within each state. + - Nodes in the same WCC are assigned to the same stream. + - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). + - In nested SDFGs: + * Stream assignment is inherited from the parent component, + * All internal components share the parent's stream. + - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. + + Example: + -------- + A state with the following independent chains: + K1 → K2 + K3 → K4 → K5 + K6 + + would be scheduled as: + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 + + (assuming no limit on the number of concurrent streams) + + Note: + ----- + These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. + """ + + def __init__(self): + # Maximum number of concurrent streams allowed (from config). + # Cached locally for frequent reuse. + self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: + """ + Assigns GPU streams to nodes within the given SDFG. + + Parameters + ---------- + sdfg : SDFG + The top-level SDFG to process. + pipeline_results : Dict + Unused. + + Returns + ------- + Dict[nodes.Node, int] + A dictionary mapping each node to its assigned GPU stream. + """ + stream_assignments: Dict[nodes.Node, int] = dict() + for state in sdfg.states(): + self._assign_gpu_streams_in_state(sdfg, False, state, stream_assignments, 0) + + return stream_assignments + + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, + stream_assignments: Dict[nodes.Node, int], gpu_stream: int) -> None: + """ + Assigns GPU streams to nodes in a single state. + + If inside a nested SDFG, components inherit the parent's stream. + Otherwise, each connected component gets a different stream. + Nested SDFGs are processed recursively. + + Parameters + ---------- + sdfg : SDFG + The SDFG containing the state. + in_nested_sdfg : bool + True if the state is in a nested SDFG. + state : SDFGState + The state to process. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to assigned GPU streams (updated in-place). + gpu_stream : int + The current GPU stream ID. 
+ + Returns + ------- + None + """ + components = self._get_weakly_connected_nodes(state) + + for component in components: + + if not self._requires_gpu_stream(state, component): + continue + + nodes_assigned_before = len(stream_assignments) + + for node in component: + stream_assignments[node] = gpu_stream + if isinstance(node, nodes.NestedSDFG): + for nested_state in node.sdfg.states(): + self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, stream_assignments, gpu_stream) + + # Move to the next stream if we have assigned streams to any node in this component + # (careful: if nested, states are in same component) + if not in_nested_sdfg and len(stream_assignments) > nodes_assigned_before: + gpu_stream = self._next_stream(gpu_stream) + + def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: + """ + Returns all weakly connected components in the given directed graph. + + A weakly connected component is a maximal group of nodes such that each pair + of nodes is connected by a path when ignoring edge directions. + + Parameters + ---------- + graph: Graph + A directed graph instance. + + Returns + ------- + List[Set[Node_T]] + + A list containing sets of nodes, with each set corresponding to a weakly + connected component. + """ + visited: Set[NodeT] = set() + components: List[Set[NodeT]] = [] + + for node in graph.nodes(): + if node in visited: + continue + + # Start a new weakly connected component + component: Set[NodeT] = set() + stack = [node] + + while stack: + current = stack.pop() + if current in visited: + continue + + visited.add(current) + component.add(current) + + for neighbor in graph.neighbors(current): + if neighbor not in visited: + stack.append(neighbor) + + components.append(component) + + return components + + def _next_stream(self, gpu_stream: int) -> int: + """ + Compute the next CUDA stream index according to the concurrency configuration. + + Behavior depends on the configured max_concurrent_streams value: + - If 0: unlimited streams allowed, so increment the stream index by one. + - If -1: default setting, always return stream 0 (no concurrency). + - Otherwise: cycle through stream indices from 0 up to max_concurrent_streams - 1. + + Parameters + ---------- + gpu_stream : int + The current CUDA stream index. + + Returns + ------- + int + The next CUDA stream index based on the concurrency policy. + """ + if self._max_concurrent_streams == 0: + return gpu_stream + 1 + elif self._max_concurrent_streams == -1: + return 0 + else: + return (gpu_stream + 1) % self._max_concurrent_streams + + def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: + """ + Check whether a connected component in an SDFG state should be assigned + a GPU stream. + + A component requires a GPU stream if it contains at least one of: + - An AccessNode with GPU global memory storage, + - A MapEntry scheduled on a GPU device, + - A Tasklet whose code includes the stream placeholder. + + Parameters + ---------- + state : SDFGState + The state containing the component. + component : Set[NodeT] + The set of nodes that form the connected component. + + Returns + ------- + bool + True if the component requires a GPU stream, False otherwise. 
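+
+        Example
+        -------
+        A component containing an AccessNode of a GPU_Global array (e.g., ``gpu_A``) or a
+        GPU_Device-scheduled MapEntry requires a stream, whereas a chain consisting only of
+        host-side AccessNodes and tasklets without the stream placeholder does not.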
+ """ + + def gpu_relevant(node, parent) -> bool: + if (isinstance(node, nodes.AccessNode) and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): + return True + + elif (isinstance(node, nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): + return True + + elif (isinstance(node, nodes.Tasklet) and STREAM_PLACEHOLDER in node.code.as_string): + return True + + return False + + for node in component: + if isinstance(node, nodes.NestedSDFG): + if any(gpu_relevant(node, parent) for node, parent in node.sdfg.all_nodes_recursive()): + return True + + else: + if gpu_relevant(node, state): + return True + + return False diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py new file mode 100644 index 0000000000..7e1a62b29c --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py @@ -0,0 +1,274 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_copy_tasklet import InsertGPUCopyTasklets + + +@properties.make_properties +@transformation.explicit_cf_compatible +class GPUStreamTopologySimplification(ppl.Pass): + """ + Simplifies an SDFG after GPU stream nodes have been added. + + This pass is optional; the SDFG works without it, but it cleans up + the topology by merging adjacent or redundant GPU stream AccessNodes. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + } + + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Simplify the SDFG topology by merging adjacent GPU stream nodes. + """ + self._merge_close_gpustream_nodes(sdfg) + + self._merge_gpustreams_special_case(sdfg) + return {} + + def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: + """ + Merge "close" GPU stream AccessNodes in the SDFG. + + This function looks for a predecessor GPU stream AccessNode that can be merged + with any successor GPU stream AccessNodes of its grand-predecessors. 
+ + Example: + + Consider two GPU copy tasklets connected via distinct GPU stream AccessNodes: + the corresponding subgraph looks like this: + + -> Sink GPU Source GPU -> + ¦ ¦ + Tasklet ------> Data AccessNode -----> Tasklet + + This function would merge the sink and source node to simplify the SDFG. + """ + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Skip AccessNodes + if isinstance(node, nodes.AccessNode): + continue + + # Find GPU stream AccessNode predecessors with no incoming edges + # (i.e. source GPU stream AccessNodes) + node_predecessors = state.predecessors(node) + preceeding_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip if there are no preceding GPU stream sources + if len(preceeding_gpustream_sources) == 0: + continue + + # If multiple GPU stream sources exist, merge them; otherwise, use the single source + if len(preceeding_gpustream_sources) > 1: + combined_stream_node = preceeding_gpustream_sources.pop() + for preceeding_gpu_stream in preceeding_gpustream_sources: + # Note: there are no ingoing edges + for out_edge in state.out_edges(preceeding_gpu_stream): + _, src_conn, dst, dst_conn, data = out_edge + state.add_edge(combined_stream_node, src_conn, dst, dst_conn, data) + state.remove_edge(out_edge) + state.remove_node(preceeding_gpu_stream) + + else: + combined_stream_node = preceeding_gpustream_sources.pop() + + # Merge grand-predecessors' successors sink GPU streams with predecessor source GPU stream + node_grand_predecessors = [ + grand_pred for pred in node_predecessors for grand_pred in state.predecessors(pred) + ] + node_gp_successors_streams = [ + succ_of_gp for gp in node_grand_predecessors for succ_of_gp in state.successors(gp) + if isinstance(succ_of_gp, nodes.AccessNode) + and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ_of_gp) == 0 + ] + + # remove duplicates + node_gp_successors_streams = list(set(node_gp_successors_streams)) + + for gp_succ_stream in node_gp_successors_streams: + for edge in state.in_edges(gp_succ_stream): + src, src_conn, _, dst_conn, data = edge + state.add_edge(src, src_conn, combined_stream_node, dst_conn, data) + state.remove_edge(edge) + # Note: the grand-predecessor's successor GPU stream is a sink node and has no + # outgoing edges + state.remove_node(gp_succ_stream) + + def _merge_gpustreams_special_case(self, sdfg: SDFG) -> None: + """ + Special-case simplification of GPU stream AccessNodes. + + This pass detects the following pattern: + - A GPU stream AccessNode `X` has a predecessor and a successor (i.e. at least one of both). + - Between the predecessor and successor lie one or more tasklets. + - These tasklets use their own distinct GPU stream AccessNodes (not `X`), + which are connected only to the tasklet itself. + + To simplify the topology, redundant streams are merged: + - A single unified input GPU stream connects to the predecessor and replaces (merges) + the per-tasklet input streams. + - A single unified output GPU stream connects to the successor and replaces (merges) + the per-tasklet output streams. + + + The simplification is easier to understand visually than in words. + Inspect the intermediate SDFGs produced by the minimal example below + to see the effect of the stream merging. 
+ + Example + ------- + @dace.program + def example(A: dace.uint32[128], B: dace.uint32[128], + C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = example.to_sdfg() + sdfg.apply_gpu_transformations() + """ + # Get the name of the GPU stream arry + gpustream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + #------------------------- Preprocess: Gather Information ---------------------------- + + # For each GPU Stream AccessNode having a predecessor and a successor: + # Determine with which Tasklet Source and which Tasklet sink nodes lie between its predecessor + # and its successor + merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + + for node, state in sdfg.all_nodes_recursive(): + + # Skip non-tasklets + if not isinstance(node, nodes.Tasklet): + continue + + # The tasklets of interest should have exactly one preceeding source GPU node and one following sink GPU node + # If not, we skip + node_predecessors = state.predecessors(node) + node_successors = state.successors(node) + downstream_gpustream_sinks = [ + succ for succ in node_successors if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0 + ] + upstream_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip not considered case + if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) + and len(upstream_gpustream_sources) == 1): + continue + + # Look for potential predecessor of a "passthrough" GPU Stream AccessNode + # which would also be the grand-predeccessor of the current node (=tasklet) + candidate_predecessor = [] + for pred in node_predecessors: + for grand_pred in state.predecessors(pred): + + # Current nodes grand pred is a candidate of a predecessor of a "passthrough" GPU Stream AccessNode + candidate = grand_pred + + # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors + if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule + == dtypes.ScheduleType.GPU_Device or isinstance(candidate, nodes.Tasklet)): + continue + + has_passthrough_gpustream = any( + (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) and ( + state.in_degree(succ) > 0 and state.out_degree(succ) > 0) + for succ in state.successors(candidate)) + + if has_passthrough_gpustream: + candidate_predecessor.append(candidate) + + # Not "close" passthrough GPU node exists if no candidate predecessor exists + if len(candidate_predecessor) == 0: + continue + + # Niche case, more than one "close" passthrough GPU node exists: Out of scope + # Ignore this case (note: This Pass only makes the Graph visually nicer, so skipping has + # no effect on correctness) + if len(candidate_predecessor) > 1: + continue + + # Get the Kernel Exits GPU stream + candidate_predecessor = candidate_predecessor[0] + passthrough_gpu_node = [ + succ for succ in state.successors(candidate_predecessor) + if isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t + ][0] + + # Collect and store the GPU stream merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # 
Note: Len is 1 + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 + if (passthrough_gpu_node, state) in merge_source_gpustream: + merge_source_gpustream[(passthrough_gpu_node, state)].append(pre_gpustream) + merge_sink_gpustream[(passthrough_gpu_node, state)].append(succ_gpustream) + else: + merge_source_gpustream[(passthrough_gpu_node, state)] = [pre_gpustream] + merge_sink_gpustream[(passthrough_gpu_node, state)] = [succ_gpustream] + + #------------------------- Merge the GPU Stream AccessNodes ---------------------------- + for passthrough_gpu_node, state in merge_sink_gpustream.keys(): + + # Add new AccessNodes which merge the other loose streams + unified_in_stream = state.add_access(gpustream_array_name) + unified_out_stream = state.add_access(gpustream_array_name) + + for in_edge in state.in_edges(passthrough_gpu_node): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_in_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + + for out_edge in state.out_edges(passthrough_gpu_node): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + + for source_stream in merge_source_gpustream[passthrough_gpu_node, state]: + for out_edge in state.out_edges(source_stream): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_in_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + state.remove_node(source_stream) + + for sink_stream in merge_sink_gpustream[passthrough_gpu_node, state]: + for in_edge in state.in_edges(sink_stream): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_out_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + state.remove_node(sink_stream) + + state.remove_node(passthrough_gpu_node) diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py new file mode 100644 index 0000000000..162aa6143f --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py @@ -0,0 +1,166 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
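+#
+# Illustrative sketch (assuming the stream array is named `gpu_streams` and the per-stream
+# variable prefix is `gpu_stream_`): for a direct host-to-device copy edge A -> gpu_A whose
+# destination was assigned stream 0, the pass rewires the copy as
+#
+#   A --> Tasklet("gpu_copy") --> gpu_A
+#
+# where the tasklet body is the backend copy call produced by OutOfKernelCopyStrategy and the
+# tasklet is additionally connected to `gpu_streams[0]` through `gpu_stream_0` connectors.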
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace import memlet as mm +from dace.codegen.targets.gpu_helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.config import Config +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUCopyTasklets(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets + } + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. 
+        """
+        # Prepare GPU stream information
+        gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']
+        gpustream_array_name, gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')
+
+        # Initialize the strategy for copies that occur outside of kernel execution
+        out_of_kernel_copy = OutOfKernelCopyStrategy()
+
+        # Collect all data copies; only the out-of-kernel ones are processed below
+        copy_worklist = self.find_all_data_copies(sdfg)
+
+        for copy_sdfg, state, src_node, dst_node, edge in copy_worklist:
+
+            copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments)
+
+            # Only insert copy tasklets for GPU-related copies occurring outside of a
+            # kernel (i.e., a GPU_Device-scheduled map)
+            if not out_of_kernel_copy.applicable(copy_context):
+                continue
+
+            # Generate the copy call
+            code = out_of_kernel_copy.generate_copy(copy_context)
+
+            # Prepare GPU stream connectors and the stream to be accessed from the
+            # GPU stream array
+            gpustream_id = gpustream_assignments[dst_node]
+            gpustream_var_name = f"{gpustream_var_name_prefix}{gpustream_id}"
+            accessed_gpustream = f"{gpustream_array_name}[{gpustream_id}]"
+
+            # Create the tasklet and add GPU stream related connectors
+            tasklet = state.add_tasklet("gpu_copy", {}, {}, code, language=dtypes.Language.CPP)
+            tasklet.add_in_connector(gpustream_var_name, dtypes.gpuStream_t, True)
+            tasklet.add_out_connector(gpustream_var_name, dtypes.gpuStream_t, True)
+
+            # Add incoming and outgoing GPU stream AccessNodes to the tasklet
+            in_gpustream = state.add_access(gpustream_array_name)
+            out_gpustream = state.add_access(gpustream_array_name)
+            state.add_edge(in_gpustream, None, tasklet, gpustream_var_name, dace.Memlet(accessed_gpustream))
+            state.add_edge(tasklet, gpustream_var_name, out_gpustream, None, dace.Memlet(accessed_gpustream))
+
+            # Route the copy through the tasklet: replace the original edge by
+            # source -> tasklet and tasklet -> destination
+            dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge
+            state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet))
+            state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet))
+            state.remove_edge(edge)
+
+        return {}
+
+    def find_all_data_copies(
+            self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]:
+        """
+        Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node,
+        destination node, and the first memlet edge of the memlet path between source and destination node.
+
+        Parameters
+        ----------
+        sdfg : SDFG
+            The SDFG to analyze for potential data copies.
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py new file mode 100644 index 0000000000..2d2c1137de --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py @@ -0,0 +1,290 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import dtypes, properties, SDFG, SDFGState +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.helpers import is_within_schedule_types +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamSyncTasklets(ppl.Pass): + """ + Inserts GPU stream synchronization tasklets in an SDFG where needed. + + This pass uses a heuristic approach to find locations matching specific patterns + that require synchronization. Additional locations can be added easily if new + cases are discovered. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets + } + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Inserts GPU stream synchronization tasklets at required locations + after certain nodes and at the end of a state, for GPU streams used in the state. 
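+
+        For example, with the CUDA backend and a stream variable prefix of ``gpu_stream_``
+        (the exact names come from the ``compiler.cuda.gpu_stream_name`` config entry), the
+        synchronization tasklet emitted for stream 0 contains code along the lines of
+        ``DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream_0));``.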
+ """ + stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] + + # Get sync locations + sync_state, sync_node = self._identify_sync_locations(sdfg, stream_assignments) + + # Synchronize after a node when required + self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) + + # Synchronize all used streams at the end of a state + self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) + return {} + + def _identify_sync_locations( + self, sdfg: SDFG, + stream_assignments: Dict[nodes.Node, int]) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: + """ + Heuristically identifies GPU stream synchronization points in an SDFG. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream ids. + + Returns + ------- + Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]] + - **sync_state**: Maps each state to the set of stream IDs that should be + synchronized at the end of the state. + - **sync_node**: The keys of this dictionary are nodes after which synchronization + is needed, and their corresponding value is the state they belong to. + """ + + # ------------------ Helper predicates ----------------------------- + + def is_gpu_global_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage == dtypes.StorageType.GPU_Global + + def is_nongpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + + def is_kernel_exit(node): + return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device + + def is_sink_node(node, state): + return state.out_degree(node) == 0 + + def edge_within_kernel(state, src, dst): + gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN + src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) + dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) + return src_in_kernel and dst_in_kernel + + def is_tasklet_with_stream_use(src): + return isinstance(src, nodes.Tasklet) and STREAM_PLACEHOLDER in src.code.as_string + + # ------------------ Sync detection logic ----------------------------- + + sync_state: Dict[SDFGState, Set[int]] = {} + sync_node: Dict[nodes.Node, SDFGState] = {} + + for edge, state in sdfg.all_edges_recursive(): + src, dst = edge.src, edge.dst + + # Ensure state is initialized in sync_state + if state not in sync_state: + sync_state[state] = set() + + # --- Heuristics for when to sync --- + if (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) + and not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): + sync_node[dst] = state + sync_state[state].add(stream_assignments[dst]) + + elif (is_nongpu_accessnode(src, state) and is_gpu_global_accessnode(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and not is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[src]) + sync_state[state].add(stream_assignments[src]) + + elif (is_kernel_exit(src) and 
is_gpu_global_accessnode(dst, state) and is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[dst]) + + elif is_tasklet_with_stream_use(src): + sync_state[state].add(stream_assignments[src]) + + else: + continue + + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " + "Expected 'SDFGState'. Please handle this case explicitly.") + + # Remove states with no syncs + sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} + + return sync_state, sync_node + + def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Inserts GPU stream synchronization tasklets at the end of SDFG states. + + For each state that requires synchronization, this method: + + 1. Generates a tasklet that synchronizes all assigned GPU streams using + the appropriate backend (e.g., CUDA). + 2. Ensures all other operations in the state complete before synchronization + by connecting all sink nodes to the tasklet. + 3. Guarantees that only a single GPU stream AccessNode connects to the sync + tasklet, creating one if needed. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_state : Dict[SDFGState, Set[int] + Mapping of states to sets of stream IDs that require synchronization at the end of the state. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for state, streams in sync_state.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Build synchronization calls for all streams used in this state + sync_code_lines = [] + for stream in streams: + gpu_stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({gpu_stream_var_name}));" + sync_code_lines.append(sync_call) + sync_code = "\n".join(sync_code_lines) + + # Create the tasklet + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_code, + language=dtypes.Language.CPP) + + # ----------------- Connect sink nodes to the synchronization tasklet ----------------- + + # 1. Seperate GPU stream sink nodes and other sink nodes + stream_sink_nodes: List[nodes.AccessNode] = [] + non_stream_sink_nodes: List[nodes.Node] = [] + for sink_node in state.sink_nodes(): + if isinstance(sink_node, nodes.AccessNode) and sink_node.desc(state).dtype == dtypes.gpuStream_t: + stream_sink_nodes.append(sink_node) + + elif sink_node != tasklet: + non_stream_sink_nodes.append(sink_node) + + # 2. Connect non-stream sink nodes to the sync tasklet + for sink_node in non_stream_sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) + + # 3. 
Connect a single GPU stream sink node (create or merge if needed) + if len(stream_sink_nodes) == 0: + combined_stream_node = state.add_access(stream_array_name) + + else: + combined_stream_node = stream_sink_nodes.pop() + for stream_node in stream_sink_nodes: + for edge in state.in_edges(stream_node): + state.add_edge(edge.src, edge.src_conn, combined_stream_node, edge.dst_conn, edge.data) + state.remove_edge(edge) + state.remove_node(stream_node) + + # Connect back to output stream node + output_stream_node = state.add_access(combined_stream_node.data) + for stream in streams: + accessed_gpu_stream = f"{stream_array_name}[{stream}]" + conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet + + tasklet.add_in_connector(conn, dtypes.gpuStream_t) + tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) + state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) + + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Insert a GPU stream synchronization tasklet immediately after specified nodes. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_node : Dict[nodes.Node, SDFGState] + Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for node, state in sync_node.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Get assigned GPU stream + stream = stream_assignments.get(node, "nullptr") + if stream == "nullptr": + raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") + + # Create the tasklet + stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_call, + language=dtypes.Language.CPP) + + #----------------- Place tasklet between node and successors, link GPU streams ---------------- + + # 1. Put the tasklet between the node and its successors + for succ in state.successors(node): + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(node, None, tasklet, None, dace.Memlet()) + + # 2. 
Connect tasklet to GPU stream AccessNodes + in_stream = state.add_access(stream_array_name) + out_stream = state.add_access(stream_array_name) + accessed_stream = f"{stream_array_name}[{stream}]" + state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) + state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) + tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py new file mode 100644 index 0000000000..f45caa5dd0 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py @@ -0,0 +1,154 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import SDFG, dtypes, properties +from dace.config import Config +from dace.sdfg import is_devicelevel_gpu +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, Node, Tasklet +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToSDFGs(ppl.Pass): + """ + Inserts a GPU stream array into the top-level SDFG and propagates it to all + nested SDFGs that require it, including intermediate SDFGs along the hierarchy. + + This pass guarantees that every relevant SDFG has the array defined, avoiding + duplication and allowing subsequent passes in the GPU stream pipeline to rely + on its presence without redefining it. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Ensure that a GPU stream array is available in all SDFGs that require it. + + The pass creates the array once at the top-level SDFG and propagates it + down the hierarchy by inserting matching arrays in child SDFGs and wiring + them through nested SDFG connectors. This way, all SDFGs share a consistent + reference to the same GPU stream array. 
+ """ + + # Extract stream array name and number of streams to allocate + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + stream_assignments: Dict[Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + + # Add the GPU stream array at the top level + sdfg.add_transient(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Ensure GPU stream array is defined where required + for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): + + # Skip if this child already has the array (inserted higher up in the hierarchy) + if stream_array_name in child_sdfg.arrays: + continue + + # Add the array to the child SDFG + inner_sdfg = child_sdfg + inner_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Walk up the hierarchy until the array is found, inserting it into each parent + outer_sdfg = inner_sdfg.parent_sdfg + while stream_array_name not in outer_sdfg.arrays: + + # Insert array in parent SDFG + outer_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Connect parent SDFG array to nested SDFG node + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(stream_array_name)) + + # Continue climbing up the hierarchy + inner_sdfg = outer_sdfg + outer_sdfg = outer_sdfg.parent_sdfg + + # Ensure final connection from the first parent that had the array down to this SDFG + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) + + outer_sdfg = inner_sdfg.parent_sdfg + + return {} + + def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: + """ + Identify all child SDFGs that require a GPU stream array in their + array descriptor store. A child SDFG requires a GPU stream if: + + - It launches GPU kernels (MapEntry/MapExit with GPU_Device schedule). + - It contains special Tasklets (e.g., from library node expansion) that + use the GPU stream they are assigned to in the code. + - It accesses GPU global memory outside device-level GPU scopes, which + implies memory copies or kernel data feeds. + + Parameters + ---------- + sdfg : SDFG + The root SDFG to inspect. + + Returns + ------- + Set[SDFG] + The set of child SDFGs that need a GPU stream array in their array descriptor + store. 
+ """ + requiring_gpu_stream = set() + for child_sdfg in sdfg.all_sdfgs_recursive(): + + # Skip the root SDFG itself + if child_sdfg is sdfg: + continue + + for state in child_sdfg.states(): + for node in state.nodes(): + + # Case 1: Kernel launch nodes + if isinstance(node, (MapEntry, MapExit)) and node.map.schedule == dtypes.ScheduleType.GPU_Device: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 2: Tasklets that use GPU stream in their code + if isinstance(node, Tasklet) and STREAM_PLACEHOLDER in node.code.as_string: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 3: Accessing GPU global memory outside device-level scopes + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Global + and not is_devicelevel_gpu(state.sdfg, state, node)): + requiring_gpu_stream.add(child_sdfg) + break + + # Stop scanning this SDFG once a reason is found + if child_sdfg in requiring_gpu_stream: + break + + return requiring_gpu_stream diff --git a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py new file mode 100644 index 0000000000..bd913ae469 --- /dev/null +++ b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py @@ -0,0 +1,274 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets +from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class GPUStreamTopologySimplification(ppl.Pass): + """ + Simplifies an SDFG after GPU stream nodes have been added. + + This pass is optional; the SDFG works without it, but it cleans up + the topology by merging adjacent or redundant GPU stream AccessNodes. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + } + + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Simplify the SDFG topology by merging adjacent GPU stream nodes. + """ + self._merge_close_gpustream_nodes(sdfg) + + self._merge_gpustreams_special_case(sdfg) + return {} + + def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: + """ + Merge "close" GPU stream AccessNodes in the SDFG. 
+ + This function looks for a predecessor GPU stream AccessNode that can be merged + with any successor GPU stream AccessNodes of its grand-predecessors. + + Example: + + Consider two GPU copy tasklets connected via distinct GPU stream AccessNodes: + the corresponding subgraph looks like this: + + -> Sink GPU Source GPU -> + ¦ ¦ + Tasklet ------> Data AccessNode -----> Tasklet + + This function would merge the sink and source node to simplify the SDFG. + """ + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Skip AccessNodes + if isinstance(node, nodes.AccessNode): + continue + + # Find GPU stream AccessNode predecessors with no incoming edges + # (i.e. source GPU stream AccessNodes) + node_predecessors = state.predecessors(node) + preceeding_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip if there are no preceding GPU stream sources + if len(preceeding_gpustream_sources) == 0: + continue + + # If multiple GPU stream sources exist, merge them; otherwise, use the single source + if len(preceeding_gpustream_sources) > 1: + combined_stream_node = preceeding_gpustream_sources.pop() + for preceeding_gpu_stream in preceeding_gpustream_sources: + # Note: there are no ingoing edges + for out_edge in state.out_edges(preceeding_gpu_stream): + _, src_conn, dst, dst_conn, data = out_edge + state.add_edge(combined_stream_node, src_conn, dst, dst_conn, data) + state.remove_edge(out_edge) + state.remove_node(preceeding_gpu_stream) + + else: + combined_stream_node = preceeding_gpustream_sources.pop() + + # Merge grand-predecessors' successors sink GPU streams with predecessor source GPU stream + node_grand_predecessors = [ + grand_pred for pred in node_predecessors for grand_pred in state.predecessors(pred) + ] + node_gp_successors_streams = [ + succ_of_gp for gp in node_grand_predecessors for succ_of_gp in state.successors(gp) + if isinstance(succ_of_gp, nodes.AccessNode) + and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ_of_gp) == 0 + ] + + # remove duplicates + node_gp_successors_streams = list(set(node_gp_successors_streams)) + + for gp_succ_stream in node_gp_successors_streams: + for edge in state.in_edges(gp_succ_stream): + src, src_conn, _, dst_conn, data = edge + state.add_edge(src, src_conn, combined_stream_node, dst_conn, data) + state.remove_edge(edge) + # Note: the grand-predecessor's successor GPU stream is a sink node and has no + # outgoing edges + state.remove_node(gp_succ_stream) + + def _merge_gpustreams_special_case(self, sdfg: SDFG) -> None: + """ + Special-case simplification of GPU stream AccessNodes. + + This pass detects the following pattern: + - A GPU stream AccessNode `X` has a predecessor and a successor (i.e. at least one of both). + - Between the predecessor and successor lie one or more tasklets. + - These tasklets use their own distinct GPU stream AccessNodes (not `X`), + which are connected only to the tasklet itself. + + To simplify the topology, redundant streams are merged: + - A single unified input GPU stream connects to the predecessor and replaces (merges) + the per-tasklet input streams. + - A single unified output GPU stream connects to the successor and replaces (merges) + the per-tasklet output streams. + + + The simplification is easier to understand visually than in words. 
+ Inspect the intermediate SDFGs produced by the minimal example below + to see the effect of the stream merging. + + Example + ------- + @dace.program + def example(A: dace.uint32[128], B: dace.uint32[128], + C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = example.to_sdfg() + sdfg.apply_gpu_transformations() + """ + # Get the name of the GPU stream arry + gpustream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + #------------------------- Preprocess: Gather Information ---------------------------- + + # For each GPU Stream AccessNode having a predecessor and a successor: + # Determine with which Tasklet Source and which Tasklet sink nodes lie between its predecessor + # and its successor + merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + + for node, state in sdfg.all_nodes_recursive(): + + # Skip non-tasklets + if not isinstance(node, nodes.Tasklet): + continue + + # The tasklets of interest should have exactly one preceeding source GPU node and one following sink GPU node + # If not, we skip + node_predecessors = state.predecessors(node) + node_successors = state.successors(node) + downstream_gpustream_sinks = [ + succ for succ in node_successors if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0 + ] + upstream_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip not considered case + if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) + and len(upstream_gpustream_sources) == 1): + continue + + # Look for potential predecessor of a "passthrough" GPU Stream AccessNode + # which would also be the grand-predeccessor of the current node (=tasklet) + candidate_predecessor = [] + for pred in node_predecessors: + for grand_pred in state.predecessors(pred): + + # Current nodes grand pred is a candidate of a predecessor of a "passthrough" GPU Stream AccessNode + candidate = grand_pred + + # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors + if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule + == dtypes.ScheduleType.GPU_Device or isinstance(candidate, nodes.Tasklet)): + continue + + has_passthrough_gpustream = any( + (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) and ( + state.in_degree(succ) > 0 and state.out_degree(succ) > 0) + for succ in state.successors(candidate)) + + if has_passthrough_gpustream: + candidate_predecessor.append(candidate) + + # Not "close" passthrough GPU node exists if no candidate predecessor exists + if len(candidate_predecessor) == 0: + continue + + # Niche case, more than one "close" passthrough GPU node exists: Out of scope + # Ignore this case (note: This Pass only makes the Graph visually nicer, so skipping has + # no effect on correctness) + if len(candidate_predecessor) > 1: + continue + + # Get the Kernel Exits GPU stream + candidate_predecessor = candidate_predecessor[0] + passthrough_gpu_node = [ + succ for succ in state.successors(candidate_predecessor) + if isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t + ][0] + + # Collect and 
store the GPU stream merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # Note: Len is 1 + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 + if (passthrough_gpu_node, state) in merge_source_gpustream: + merge_source_gpustream[(passthrough_gpu_node, state)].append(pre_gpustream) + merge_sink_gpustream[(passthrough_gpu_node, state)].append(succ_gpustream) + else: + merge_source_gpustream[(passthrough_gpu_node, state)] = [pre_gpustream] + merge_sink_gpustream[(passthrough_gpu_node, state)] = [succ_gpustream] + + #------------------------- Merge the GPU Stream AccessNodes ---------------------------- + for passthrough_gpu_node, state in merge_sink_gpustream.keys(): + + # Add new AccessNodes which merge the other loose streams + unified_in_stream = state.add_access(gpustream_array_name) + unified_out_stream = state.add_access(gpustream_array_name) + + for in_edge in state.in_edges(passthrough_gpu_node): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_in_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + + for out_edge in state.out_edges(passthrough_gpu_node): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + + for source_stream in merge_source_gpustream[passthrough_gpu_node, state]: + for out_edge in state.out_edges(source_stream): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_in_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + state.remove_node(source_stream) + + for sink_stream in merge_sink_gpustream[passthrough_gpu_node, state]: + for in_edge in state.in_edges(sink_stream): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_out_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + state.remove_node(sink_stream) + + state.remove_node(passthrough_gpu_node) diff --git a/dace/transformation/passes/gpustream/gpustream_scheduling.py b/dace/transformation/passes/gpustream/gpustream_scheduling.py new file mode 100644 index 0000000000..0ad3c2e7c0 --- /dev/null +++ b/dace/transformation/passes/gpustream/gpustream_scheduling.py @@ -0,0 +1,249 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, List, Set, Type, Union + +import dace +from dace import SDFG, SDFGState, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg.graph import Graph, NodeT +from dace.transformation import pass_pipeline as ppl, transformation + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class NaiveGPUStreamScheduler(ppl.Pass): + """ + Assigns GPU streams to nodes and stores the assignments in a dictionary. + This can be useful for enabling asynchronous and parallel GPU computation using GPU streams. + + Strategy Overview: + ------------------ + - GPU stream assignment is based on weakly connected components (WCCs) within each state. + - Nodes in the same WCC are assigned to the same stream. + - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). + - In nested SDFGs: + * Stream assignment is inherited from the parent component, + * All internal components share the parent's stream. 
+ - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. + + Example: + -------- + A state with the following independent chains: + K1 → K2 + K3 → K4 → K5 + K6 + + would be scheduled as: + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 + + (assuming no limit on the number of concurrent streams) + + Note: + ----- + These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. + """ + + def __init__(self): + # Maximum number of concurrent streams allowed (from config). + # Cached locally for frequent reuse. + self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: + """ + Assigns GPU streams to nodes within the given SDFG. + + Parameters + ---------- + sdfg : SDFG + The top-level SDFG to process. + pipeline_results : Dict + Unused. + + Returns + ------- + Dict[nodes.Node, int] + A dictionary mapping each node to its assigned GPU stream. + """ + stream_assignments: Dict[nodes.Node, int] = dict() + for state in sdfg.states(): + self._assign_gpu_streams_in_state(sdfg, False, state, stream_assignments, 0) + + return stream_assignments + + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, + stream_assignments: Dict[nodes.Node, int], gpu_stream: int) -> None: + """ + Assigns GPU streams to nodes in a single state. + + If inside a nested SDFG, components inherit the parent's stream. + Otherwise, each connected component gets a different stream. + Nested SDFGs are processed recursively. + + Parameters + ---------- + sdfg : SDFG + The SDFG containing the state. + in_nested_sdfg : bool + True if the state is in a nested SDFG. + state : SDFGState + The state to process. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to assigned GPU streams (updated in-place). + gpu_stream : int + The current GPU stream ID. + + Returns + ------- + None + """ + components = self._get_weakly_connected_nodes(state) + + for component in components: + + if not self._requires_gpu_stream(state, component): + continue + + nodes_assigned_before = len(stream_assignments) + + for node in component: + stream_assignments[node] = gpu_stream + if isinstance(node, nodes.NestedSDFG): + for nested_state in node.sdfg.states(): + self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, stream_assignments, gpu_stream) + + # Move to the next stream if we have assigned streams to any node in this component + # (careful: if nested, states are in same component) + if not in_nested_sdfg and len(stream_assignments) > nodes_assigned_before: + gpu_stream = self._next_stream(gpu_stream) + + def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: + """ + Returns all weakly connected components in the given directed graph. + + A weakly connected component is a maximal group of nodes such that each pair + of nodes is connected by a path when ignoring edge directions. + + Parameters + ---------- + graph: Graph + A directed graph instance. + + Returns + ------- + List[Set[Node_T]] + + A list containing sets of nodes, with each set corresponding to a weakly + connected component. 
+ """ + visited: Set[NodeT] = set() + components: List[Set[NodeT]] = [] + + for node in graph.nodes(): + if node in visited: + continue + + # Start a new weakly connected component + component: Set[NodeT] = set() + stack = [node] + + while stack: + current = stack.pop() + if current in visited: + continue + + visited.add(current) + component.add(current) + + for neighbor in graph.neighbors(current): + if neighbor not in visited: + stack.append(neighbor) + + components.append(component) + + return components + + def _next_stream(self, gpu_stream: int) -> int: + """ + Compute the next CUDA stream index according to the concurrency configuration. + + Behavior depends on the configured max_concurrent_streams value: + - If 0: unlimited streams allowed, so increment the stream index by one. + - If -1: default setting, always return stream 0 (no concurrency). + - Otherwise: cycle through stream indices from 0 up to max_concurrent_streams - 1. + + Parameters + ---------- + gpu_stream : int + The current CUDA stream index. + + Returns + ------- + int + The next CUDA stream index based on the concurrency policy. + """ + if self._max_concurrent_streams == 0: + return gpu_stream + 1 + elif self._max_concurrent_streams == -1: + return 0 + else: + return (gpu_stream + 1) % self._max_concurrent_streams + + def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: + """ + Check whether a connected component in an SDFG state should be assigned + a GPU stream. + + A component requires a GPU stream if it contains at least one of: + - An AccessNode with GPU global memory storage, + - A MapEntry scheduled on a GPU device, + - A Tasklet whose code includes the stream placeholder. + + Parameters + ---------- + state : SDFGState + The state containing the component. + component : Set[NodeT] + The set of nodes that form the connected component. + + Returns + ------- + bool + True if the component requires a GPU stream, False otherwise. + """ + + def gpu_relevant(node, parent) -> bool: + if (isinstance(node, nodes.AccessNode) and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): + return True + + elif (isinstance(node, nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): + return True + + elif (isinstance(node, nodes.Tasklet) and STREAM_PLACEHOLDER in node.code.as_string): + return True + + return False + + for node in component: + if isinstance(node, nodes.NestedSDFG): + if any(gpu_relevant(node, parent) for node, parent in node.sdfg.all_nodes_recursive()): + return True + + else: + if gpu_relevant(node, state): + return True + + return False diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py new file mode 100644 index 0000000000..b4a7b9a65d --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -0,0 +1,288 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import dtypes, properties, SDFG, SDFGState +from dace.codegen import common +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamSyncTasklets(ppl.Pass): + """ + Inserts GPU stream synchronization tasklets in an SDFG where needed. + + This pass uses a heuristic approach to find locations matching specific patterns + that require synchronization. Additional locations can be added easily if new + cases are discovered. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Inserts GPU stream synchronization tasklets at required locations + after certain nodes and at the end of a state, for GPU streams used in the state. + """ + stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] + + # Get sync locations + sync_state, sync_node = self._identify_sync_locations(sdfg, stream_assignments) + + # Synchronize after a node when required + self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) + + # Synchronize all used streams at the end of a state + self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) + return {} + + def _identify_sync_locations( + self, sdfg: SDFG, + stream_assignments: Dict[nodes.Node, int]) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: + """ + Heuristically identifies GPU stream synchronization points in an SDFG. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream ids. + + Returns + ------- + Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]] + - **sync_state**: Maps each state to the set of stream IDs that should be + synchronized at the end of the state. + - **sync_node**: The keys of this dictionary are nodes after which synchronization + is needed, and their corresponding value is the state they belong to. 
+ """ + + # ------------------ Helper predicates ----------------------------- + + def is_gpu_global_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage == dtypes.StorageType.GPU_Global + + def is_nongpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + + def is_kernel_exit(node): + return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device + + def is_sink_node(node, state): + return state.out_degree(node) == 0 + + def edge_within_kernel(state, src, dst): + gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN + src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) + dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) + return src_in_kernel and dst_in_kernel + + def is_tasklet_with_stream_use(src): + return isinstance(src, nodes.Tasklet) and STREAM_PLACEHOLDER in src.code.as_string + + # ------------------ Sync detection logic ----------------------------- + + sync_state: Dict[SDFGState, Set[int]] = {} + sync_node: Dict[nodes.Node, SDFGState] = {} + + for edge, state in sdfg.all_edges_recursive(): + src, dst = edge.src, edge.dst + + # Ensure state is initialized in sync_state + if state not in sync_state: + sync_state[state] = set() + + # --- Heuristics for when to sync --- + if (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_node[dst] = state + sync_state[state].add(stream_assignments[dst]) + + elif (is_nongpu_accessnode(src, state) and is_gpu_global_accessnode(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and not is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[src]) + sync_state[state].add(stream_assignments[src]) + + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[dst]) + + elif is_tasklet_with_stream_use(src): + sync_state[state].add(stream_assignments[src]) + + else: + continue + + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " + "Expected 'SDFGState'. Please handle this case explicitly.") + + # Remove states with no syncs + sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} + + return sync_state, sync_node + + def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Inserts GPU stream synchronization tasklets at the end of SDFG states. + + For each state that requires synchronization, this method: + + 1. Generates a tasklet that synchronizes all assigned GPU streams using + the appropriate backend (e.g., CUDA). + 2. Ensures all other operations in the state complete before synchronization + by connecting all sink nodes to the tasklet. + 3. 
Guarantees that only a single GPU stream AccessNode connects to the sync + tasklet, creating one if needed. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_state : Dict[SDFGState, Set[int] + Mapping of states to sets of stream IDs that require synchronization at the end of the state. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for state, streams in sync_state.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Build synchronization calls for all streams used in this state + sync_code_lines = [] + for stream in streams: + gpu_stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({gpu_stream_var_name}));" + sync_code_lines.append(sync_call) + sync_code = "\n".join(sync_code_lines) + + # Create the tasklet + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_code, + language=dtypes.Language.CPP) + + # ----------------- Connect sink nodes to the synchronization tasklet ----------------- + + # 1. Seperate GPU stream sink nodes and other sink nodes + stream_sink_nodes: List[nodes.AccessNode] = [] + non_stream_sink_nodes: List[nodes.Node] = [] + for sink_node in state.sink_nodes(): + if isinstance(sink_node, nodes.AccessNode) and sink_node.desc(state).dtype == dtypes.gpuStream_t: + stream_sink_nodes.append(sink_node) + + elif sink_node != tasklet: + non_stream_sink_nodes.append(sink_node) + + # 2. Connect non-stream sink nodes to the sync tasklet + for sink_node in non_stream_sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) + + # 3. Connect a single GPU stream sink node (create or merge if needed) + if len(stream_sink_nodes) == 0: + combined_stream_node = state.add_access(stream_array_name) + + else: + combined_stream_node = stream_sink_nodes.pop() + for stream_node in stream_sink_nodes: + for edge in state.in_edges(stream_node): + state.add_edge(edge.src, edge.src_conn, combined_stream_node, edge.dst_conn, edge.data) + state.remove_edge(edge) + state.remove_node(stream_node) + + # Connect back to output stream node + output_stream_node = state.add_access(combined_stream_node.data) + for stream in streams: + accessed_gpu_stream = f"{stream_array_name}[{stream}]" + conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet + + tasklet.add_in_connector(conn, dtypes.gpuStream_t) + tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) + state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) + + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Insert a GPU stream synchronization tasklet immediately after specified nodes. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_node : Dict[nodes.Node, SDFGState] + Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. 
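[Editor's note] For illustration, a small stand-alone sketch of the synchronization code these tasklets end up containing. The `"gpu_streams,gpu_stream_"` config value, the backend, and the stream IDs are made-up examples; the real values come from `compiler.cuda.gpu_stream_name` and `common.get_gpu_backend()`.

# First part names the stream array, second is the per-stream variable prefix
stream_array_name, stream_var_prefix = "gpu_streams,gpu_stream_".split(",")
backend = "cuda"                         # or "hip"
streams = {0, 2}                         # streams used in the state

sync_code = "\n".join(
    f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_prefix}{s}));"
    for s in sorted(streams))
print(sync_code)
# DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream_0));
# DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream_2));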
+ """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for node, state in sync_node.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Get assigned GPU stream + stream = stream_assignments.get(node, "nullptr") + if stream == "nullptr": + raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") + + # Create the tasklet + stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" + tasklet = state.add_tasklet( name=f"gpu_stream_{stream}_synchronization", + inputs=set(), outputs=set(), + code=sync_call, language=dtypes.Language.CPP) + + + #----------------- Place tasklet between node and successors, link GPU streams ---------------- + + # 1. Put the tasklet between the node and its successors + for succ in state.successors(node): + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(node, None, tasklet, None, dace.Memlet()) + + # 2. Connect tasklet to GPU stream AccessNodes + in_stream = state.add_access(stream_array_name) + out_stream = state.add_access(stream_array_name) + accessed_stream = f"{stream_array_name}[{stream}]" + state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) + state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) + tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) + \ No newline at end of file diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py new file mode 100644 index 0000000000..23bb4c7c94 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py @@ -0,0 +1,70 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToKernels(ppl.Pass): + """ + This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps). + + Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, + indicating which GPU stream each kernel is assigned to. These assignments are e.g. + used when launching the kernels. 
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream array name and the prefix for individual stream variables + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Link kernels to their assigned GPU streams + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a kernel entry - continue + if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}" + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Assign the GPU stream to the kernel entry + kernel_entry = node + kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_in = state.add_access(stream_array_name) + state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, + dace.Memlet(accessed_gpu_stream)) + + # Assign the GPU stream to the kernel exit + kernel_exit = state.exit_node(kernel_entry) + kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_out = state.add_access(stream_array_name) + state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, + dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py new file mode 100644 index 0000000000..b55e4889a1 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py @@ -0,0 +1,155 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import SDFG, dtypes, properties +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types +from dace.config import Config +from dace.sdfg import is_devicelevel_gpu +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, Node, Tasklet +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToSDFGs(ppl.Pass): + """ + Inserts a GPU stream array into the top-level SDFG and propagates it to all + nested SDFGs that require it, including intermediate SDFGs along the hierarchy. + + This pass guarantees that every relevant SDFG has the array defined, avoiding + duplication and allowing subsequent passes in the GPU stream pipeline to rely + on its presence without redefining it. 
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Ensure that a GPU stream array is available in all SDFGs that require it. + + The pass creates the array once at the top-level SDFG and propagates it + down the hierarchy by inserting matching arrays in child SDFGs and wiring + them through nested SDFG connectors. This way, all SDFGs share a consistent + reference to the same GPU stream array. + """ + + # Extract stream array name and number of streams to allocate + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + stream_assignments: Dict[Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + + # Add the GPU stream array at the top level + sdfg.add_transient(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Ensure GPU stream array is defined where required + for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): + + # Skip if this child already has the array (inserted higher up in the hierarchy) + if stream_array_name in child_sdfg.arrays: + continue + + # Add the array to the child SDFG + inner_sdfg = child_sdfg + inner_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Walk up the hierarchy until the array is found, inserting it into each parent + outer_sdfg = inner_sdfg.parent_sdfg + while stream_array_name not in outer_sdfg.arrays: + + # Insert array in parent SDFG + outer_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Connect parent SDFG array to nested SDFG node + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(stream_array_name)) + + # Continue climbing up the hierarchy + inner_sdfg = outer_sdfg + outer_sdfg = outer_sdfg.parent_sdfg + + # Ensure final connection from the first parent that had the array down to this SDFG + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) + + outer_sdfg = inner_sdfg.parent_sdfg + + return {} + + def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: + """ + Identify all child SDFGs that require a GPU stream array in their + array descriptor store. A child SDFG requires a GPU stream if: + + - It launches GPU kernels (MapEntry/MapExit with GPU_Device schedule). + - It contains special Tasklets (e.g., from library node expansion) that + use the GPU stream they are assigned to in the code. 
+ - It accesses GPU global memory outside device-level GPU scopes, which + implies memory copies or kernel data feeds. + + Parameters + ---------- + sdfg : SDFG + The root SDFG to inspect. + + Returns + ------- + Set[SDFG] + The set of child SDFGs that need a GPU stream array in their array descriptor + store. + """ + requiring_gpu_stream = set() + for child_sdfg in sdfg.all_sdfgs_recursive(): + + # Skip the root SDFG itself + if child_sdfg is sdfg: + continue + + for state in child_sdfg.states(): + for node in state.nodes(): + + # Case 1: Kernel launch nodes + if isinstance(node, (MapEntry, MapExit)) and node.map.schedule == dtypes.ScheduleType.GPU_Device: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 2: Tasklets that use GPU stream in their code + if isinstance(node, Tasklet) and STREAM_PLACEHOLDER in node.code.as_string: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 3: Accessing GPU global memory outside device-level scopes + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Global + and not is_devicelevel_gpu(state.sdfg, state, node)): + requiring_gpu_stream.add(child_sdfg) + break + + # Stop scanning this SDFG once a reason is found + if child_sdfg in requiring_gpu_stream: + break + + return requiring_gpu_stream diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py new file mode 100644 index 0000000000..1438472da0 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py @@ -0,0 +1,80 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToTasklets(ppl.Pass): + """ + This pass ensures that tasklets which require access to their assigned GPU stream + are provided with it explicitly. + + Such tasklets typically originate from expanded LibraryNodes targeting GPUs. + These nodes may reference the special placeholder variable `__dace_current_stream`, + which is expected to be defined during unparsing in `cpp.py`. + + To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use + the GPU stream AccessNode directly. + + Note that this pass is similar to `InsertGPUStreamsToKernels`. 
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream's array name + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code + # and provide them the needed GPU stream explicitly + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a tasklet - continue + if not isinstance(node, nodes.Tasklet): + continue + + # Tasklet does not need use its assigned GPU stream - continue + if not STREAM_PLACEHOLDER in node.code.as_string: + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_conn = STREAM_PLACEHOLDER + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Provide the GPU stream explicitly to the tasklet + stream_array_in = state.add_access(stream_array_name) + stream_array_out = state.add_access(stream_array_name) + + node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t) + node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True) + + state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/insert_gpu_copy_tasklets.py b/dace/transformation/passes/insert_gpu_copy_tasklets.py new file mode 100644 index 0000000000..447adc7767 --- /dev/null +++ b/dace/transformation/passes/insert_gpu_copy_tasklets.py @@ -0,0 +1,166 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace import memlet as mm +from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.config import Config +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets +from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUCopyTasklets(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets + } + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. 
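[Editor's note] Purely illustrative: the kind of backend call such a copy tasklet would carry for an out-of-kernel host-to-device transfer. The actual code string is produced by `OutOfKernelCopyStrategy.generate_copy`, not by this sketch; the names and sizes below are made up.

backend = "cuda"
src, dst, nbytes, stream_var = "A", "gpu_A", 128 * 8, "gpu_stream_0"
copy_call = (f"DACE_GPU_CHECK({backend}MemcpyAsync({dst}, {src}, {nbytes}, "
             f"{backend}MemcpyHostToDevice, {stream_var}));")
print(copy_call)
# DACE_GPU_CHECK(cudaMemcpyAsync(gpu_A, A, 1024, cudaMemcpyHostToDevice, gpu_stream_0));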
+ """ + # Prepare GPU stream + gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + gpustream_array_name, gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Initialize the strategy for copies that occur outside of kernel execution + out_of_kernel_copy = OutOfKernelCopyStrategy() + + # Get all data copies to process the out of kernel copies + copy_worklist = self.find_all_data_copies(sdfg) + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments) + + # Only insert copy tasklets for GPU related copies occuring out of the + # kernel (i.e. a GPU_device scheduled map) + if not out_of_kernel_copy.applicable(copy_context): + continue + + # Generatae the copy call + code = out_of_kernel_copy.generate_copy(copy_context) + + # Prepare GPU ustream connectors and the stream to be accessed from the + # GPU stream array + gpustream_id = gpustream_assignments[dst_node] + gpustream_var_name = f"{gpustream_var_name_prefix}{gpustream_id}" + accessed_gpustream = f"{gpustream_array_name}[{gpustream_id}]" + + # Create the tasklet and add GPU stream related connectors + tasklet = state.add_tasklet("gpu_copy", {}, {}, code, language=dtypes.Language.CPP) + tasklet.add_in_connector(gpustream_var_name, dtypes.gpuStream_t, True) + tasklet.add_out_connector(gpustream_var_name, dtypes.gpuStream_t, True) + + # Add incoming and outgoing GPU stream accessNodes to the tasklet + in_gpustream = state.add_access(gpustream_array_name) + out_gpustream = state.add_access(gpustream_array_name) + state.add_edge(in_gpustream, None, tasklet, gpustream_var_name, dace.Memlet(accessed_gpustream)) + state.add_edge(tasklet, gpustream_var_name, out_gpustream, None, dace.Memlet(accessed_gpustream)) + + # Put the tasklet in between the edge + dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge + state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet)) + state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(edge) + + return {} + + def find_all_data_copies( + self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: + """ + Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, + destination node, and the first memlet edge of in the memlet path between source and destination node. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze for potential data copies. 
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist diff --git a/dace/transformation/passes/move_array_out_of_kernel.py b/dace/transformation/passes/move_array_out_of_kernel.py new file mode 100644 index 0000000000..bd7e401187 --- /dev/null +++ b/dace/transformation/passes/move_array_out_of_kernel.py @@ -0,0 +1,901 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, FrozenSet, Set, Tuple, List, Optional +import copy +import functools +from collections import deque + +import sympy + +import dace +from dace import SDFG, SDFGState, dtypes, data as dt +from dace.sdfg import nodes +from dace.properties import make_properties +from dace.transformation import transformation, helpers +from dace.transformation.pass_pipeline import Pass +from dace.subsets import Range +from dace.sdfg.graph import MultiConnectorEdge +from dace.memlet import Memlet +from dace.symbolic import symbol + +import dace.sdfg.utils as sdutil + + +@make_properties +@transformation.explicit_cf_compatible +class MoveArrayOutOfKernel(Pass): + """ + This pass supports a legacy use case in the 'ExperimentalCUDACodeGen' backend: the use of + transient arrays with dtypes.StorageType.GPU_Global inside GPU_Device scheduled maps (kernels). + Previously, the old 'CUDACodeGen' moved such arrays outside the kernel during codegen, which caused: + + 1. Mismatches between the SDFG and the generated code, + 2. Complex, misplaced logic in codegen, + 3. Incorrect semantics — a single shared array was reused instead of per-iteration replication, + leading to race conditions. + + This pass fixes these issues by explicitly lifting such arrays out of GPU_Device maps + and creating disjoint arrays per map iteration. Unlike the legacy approach, the transformation + is now visible and consistent at the SDFG level, avoiding naming collisions and improving clarity. + + NOTE: There is no true "local device (GPU_Device) memory" on GPUs, but DaCe supports this + pattern for legacy reasons. This pass exists purely for backward compatibility, and its use + is strongly discouraged. + """ + + def __init__(self): + """ + Initializes caches for mapping nodes to their states and SDFGs. + + This avoids repeatedly traversing the SDFG structure during the pass. + The caches are populated in `apply_pass` for convenience. 
+ """ + self._node_to_state_cache: Dict[nodes.Node, SDFGState] = dict() + self._node_to_sdfg_cache: Dict[nodes.Node, SDFG] = dict() + + # Entry point + def apply_pass(self, root_sdfg: SDFG, kernel_entry: nodes.MapEntry, array_name: str) -> None: + """ + Applies the pass to move a transient GPU_Global array out of a GPU_Device map. + + Args: + root_sdfg: The top-level SDFG to operate on. + kernel_entry: The MapEntry node representing the GPU_Device scheduled map (i.e., the kernel) + that contains the transient array. + array_name: The name of the transient array to move. Note that multiple arrays with the + same name may exist within the kernel. All will be lifted. + """ + # Cache every nodes parent state and parent sdfg + for node, parent in root_sdfg.all_nodes_recursive(): + if isinstance(node, nodes.Node): + assert isinstance(parent, SDFGState) + self._node_to_state_cache[node] = parent + self._node_to_sdfg_cache[node] = parent.sdfg + + # Check if all access nodes to 'array_name' within the kernel are defined in the same SDFG as the map + kernel_parent_sdfg = self._node_to_sdfg_cache[kernel_entry] + simple_case = True + for (_, outermost_sdfg, _, _) in self.collect_array_descriptor_usage(kernel_entry, array_name): + if outermost_sdfg != kernel_parent_sdfg: + simple_case = False + break + + if simple_case: + # All access nodes are in the same SDFG as the kernel map - easy + access_nodes = [an for an, _, _ in self.get_access_nodes_within_map(kernel_entry, array_name)] + self.move_array_out_of_kernel_flat(kernel_entry, array_name, access_nodes) + else: + # Access nodes span nested maps or SDFGs — more involved (more checks, naming conflicts, several seperate + # array descriptors with the same array_name) + self.move_array_out_of_kernel_nested(kernel_entry, array_name) + + # Main transformation algorithms and helpers + def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name: str, + access_nodes: List[nodes.AccessNode]) -> None: + """ + Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the flat case. + + This function handles the simpler case where all access nodes to the array are in the same + SDFG and state as the kernel map. Therefore, there are no nested SDFGs or naming conflicts + (since an SDFG cannot define multiple descriptors with the same name). + + The array is reshaped to allocate a disjoint slice per map iteration. For example, given: + + for x, y in dace.map[0:128, 0:32] @ GPU_Device: + gpu_A = dace.define_local([64], dtype, storage=GPU_Global) + + the array shape will be updated to [128, 32, 64], and memlets will ensure each thread + accesses [x, y, 0:64]. + + Additionally, this method inserts the necessary access nodes and edges to correctly move + the array out of the map scope and maintain correctness. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel. + array_name: Name of the transient array to move. + access_nodes: List of access nodes referring to the array inside the map. + """ + # A closest AccessNode of kernel exit is used + parent_state = self._node_to_state_cache[kernel_entry] + kernel_exit: nodes.MapExit = parent_state.exit_node(kernel_entry) + closest_an = self.get_nearest_access_node(access_nodes, kernel_exit) + array_desc = closest_an.desc(parent_state) + + # Get the chain of MapEntries from the AccessNode up to and including the kernel map entry + map_entry_chain, _ = self.get_maps_between(kernel_entry, closest_an) + + # Store the original full-range subset of the array. 
+ # Needed to define correct memlets when moving the array out of the kernel. + old_subset = [(0, dim - 1, 1) for dim in array_desc.shape] + + # Update the array + new_shape, new_strides, new_total_size, new_offsets = self.get_new_shape_info(array_desc, map_entry_chain) + array_desc.set_shape(new_shape=new_shape, strides=new_strides, total_size=new_total_size, offset=new_offsets) + + # Update all memlets + self.update_memlets(kernel_entry, array_name, closest_an, access_nodes) + + # add new edges to move access Node out of map + in_connector: str = 'IN_' + array_name + out_connector: str = 'OUT_' + array_name + previous_node = closest_an + previous_out_connector = None + for next_map_entry in map_entry_chain: + + next_map_exit = parent_state.exit_node(next_map_entry) + if in_connector not in next_map_exit.in_connectors: + next_map_state = self._node_to_state_cache[next_map_exit] + next_map_exit.add_in_connector(in_connector) + next_map_exit.add_out_connector(out_connector) + + next_entries, _ = self.get_maps_between(kernel_entry, previous_node) + + next_map_state.add_edge(previous_node, previous_out_connector, next_map_exit, in_connector, + Memlet.from_array(array_name, array_desc)) + + previous_node = next_map_exit + previous_out_connector = out_connector + + # New Access Node outside of the target map, connected to the exit + access_node_outside = parent_state.add_access(array_name) + parent_state.add_edge(kernel_exit, out_connector, access_node_outside, None, + Memlet.from_array(array_name, array_desc)) + + def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_name: str) -> None: + """ + Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the nested case. + + This function handles the more complex scenario where access nodes to the array may be + defined inside nested SDFGs within the kernel's parent SDFG. It moves the array out of + all nested maps and SDFGs, updating shapes and memlets accordingly, and resolves naming + conflicts that arise from multiple descriptors with the same name in different scopes + (by renaming). + + The method also ensures that the array is correctly lifted through all nested SDFGs + between its original definition and the kernel map, updating symbols and connectors + along the way. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel. + array_name: Name of the transient array to move. + """ + # Collect all information about every distinct data descriptor with the same name "array_name" + array_descriptor_usage = self.collect_array_descriptor_usage(kernel_entry, array_name) + original_array_name = array_name + kernel_parent_sdfg = self._node_to_sdfg_cache[kernel_entry] + + for array_desc, outermost_sdfg, sdfg_defined, access_nodes in array_descriptor_usage: + + if outermost_sdfg == kernel_parent_sdfg: + # Special case: There are nested accesss nodes, but their descriptor is defined at + # the same sdfg as the kernel. Thus, we can use the simpler algorithm. + self.move_array_out_of_kernel_flat(kernel_entry, original_array_name, list(access_nodes)) + continue + + # The outermost node + nsdfg_node = outermost_sdfg.parent_nsdfg_node + map_entry_chain, _ = self.get_maps_between(kernel_entry, nsdfg_node) + + # Store the original full-range subset of the array. + # Needed to define correct memlets when moving the array out of the kernel. 
+ old_subset = [(0, dim - 1, 1) for dim in array_desc.shape] + + # Update array_descriptor + new_shape, new_strides, new_total_size, new_offsets = self.get_new_shape_info(array_desc, map_entry_chain) + array_desc.set_shape(new_shape=new_shape, + strides=new_strides, + total_size=new_total_size, + offset=new_offsets) + array_desc.transient = False + + # Update memlets data movement + self.update_memlets(kernel_entry, original_array_name, nsdfg_node, access_nodes) + + # Update name if names conflict + required, array_name = self.new_name_required(kernel_entry, original_array_name, sdfg_defined) + if required: + self.replace_array_name(sdfg_defined, original_array_name, array_name, array_desc) + + # Ensure required symbols are defined + self.update_symbols(map_entry_chain, kernel_parent_sdfg) + + # Collect all SDFGs from the outermost definition to the target map's parent (inclusive) + sdfg_hierarchy: List[SDFG] = [outermost_sdfg] + current_sdfg = outermost_sdfg + while current_sdfg != kernel_parent_sdfg: + current_sdfg = current_sdfg.parent_sdfg + sdfg_hierarchy.append(current_sdfg) + + # Validate collected SDFGs: no None entries + if any(sdfg is None for sdfg in sdfg_hierarchy): + raise ValueError("Invalid SDFG hierarchy: contains 'None' entries. This should not happen.") + + # Validate depth: must include at least outer + target SDFG + if len(sdfg_hierarchy) < 2: + raise ValueError(f"Invalid SDFG hierarchy: only one SDFG found. " + f"Expected at least two levels, since {outermost_sdfg} is not equal to " + "the kernel map's SDFG and is contained within it — the last entry should " + "be the kernel's parent SDFG.") + + self.lift_array_through_nested_sdfgs(array_name, kernel_entry, sdfg_hierarchy, old_subset) + + def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.MapEntry, sdfg_hierarchy: List[SDFG], + old_subset: List) -> None: + """ + Lifts a transient array through nested SDFGs. + + For each SDFG in the hierarchy (from inner to outer), this deepcopies the array descriptor + and adds edges from the NestedSDFG node through any enclosing maps to a new access node. + This is done until the kernel is exited. + Memlets are updated using `old_subset` and enclosing map parameters. + + Args: + array_name: Name of the array to lift. + kernel_entry: Innermost GPU kernel MapEntry. + sdfg_hierarchy: Ordered list of nested SDFGs (inner to outer). + old_subset: Inner array subset used for memlet construction. + """ + # Move array out ouf the kernel map entry through nested SDFGs + outer_sdfg = sdfg_hierarchy.pop(0) + while sdfg_hierarchy: + inner_sdfg = outer_sdfg + outer_sdfg = sdfg_hierarchy.pop(0) + nsdfg_node = inner_sdfg.parent_nsdfg_node + nsdfg_parent_state = self._node_to_state_cache[nsdfg_node] + + # copy and add the descriptor to the outer sdfg + old_desc = inner_sdfg.arrays[array_name] + new_desc = copy.deepcopy(old_desc) + outer_sdfg.add_datadesc(array_name, new_desc) + + # Get all parent scopes to detect how the data needs to flow. + # E.g. nsdfg_node -> MapExit needs to be nsdfg_node -> MapExit -> AccessNode (new) + parent_scopes: List[nodes.MapEntry] = [] + current_parent_scope = nsdfg_node + scope_dict = nsdfg_parent_state.scope_dict() + while scope_dict[current_parent_scope] is not None and current_parent_scope is not kernel_entry: + parent_scopes.append(scope_dict[current_parent_scope]) + current_parent_scope = scope_dict[current_parent_scope] + + # Get a new AccessNode where the nsdfg node's parent state is. 
+ # Note: This is in the OUTER sdfg, so this is the first accessNode accessing + # the current array descriptor + exit_access_node = nsdfg_parent_state.add_access(array_name) + + # Cache its location + self._node_to_state_cache[exit_access_node] = nsdfg_parent_state + self._node_to_sdfg_cache[exit_access_node] = outer_sdfg + + # Create a dataflow path from the NestedSDFG node to the new exit access node, + # passing through any enclosing map scopes (if the NestedSDFG is nested within maps). + src = nsdfg_node + for scope_entry in parent_scopes: + # next destination is the scope exit + scope_exit = nsdfg_parent_state.exit_node(scope_entry) + dst = scope_exit + + # Next, add edge between src and dst in 2 steps: + # 1.1 Determine source connector name and register it based on src type + if isinstance(src, nodes.NestedSDFG): + src_conn = array_name + src.add_out_connector(src_conn) + elif isinstance(src, nodes.MapExit): + src_conn = f"OUT_{array_name}" + src.add_out_connector(src_conn) + else: + raise NotImplementedError( + f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected." + ) + + # 1.2 Determine destination connector name and register it based on dst type + if isinstance(dst, nodes.AccessNode): + dst_conn = None # AccessNodes use implicit connectors + elif isinstance(dst, nodes.MapExit): # Assuming dst is the entry for parent scope + dst_conn = f"IN_{array_name}" + dst.add_in_connector(dst_conn) + else: + raise NotImplementedError( + f"Unsupported destination node type '{type(dst).__name__}' — expected AccessNode or MapEntry.") + + # 2. Add the edge using the connector names determined in Step 1. + next_entries, _ = self.get_maps_between(kernel_entry, src) + memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) + nsdfg_parent_state.add_edge(src, src_conn, dst, dst_conn, Memlet.from_array(array_name, new_desc)) + + # Continue by setting the dst as source + src = dst + + # After processing all scopes, the last src (which is either the last MapExit or the intial nsdfg if there are no parent scope) + # needs to be connected to the exit access node added before + dst = exit_access_node + + if isinstance(src, nodes.NestedSDFG): + src_conn = array_name + src.add_out_connector(src_conn) + elif isinstance(src, nodes.MapExit): + src_conn = f"OUT_{array_name}" + src.add_out_connector(src_conn) + else: + raise NotImplementedError( + f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") + + next_entries, _ = self.get_maps_between(kernel_entry, src) + memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) + nsdfg_parent_state.add_edge(src, src_conn, dst, None, Memlet.from_array(array_name, new_desc)) + + # At the outermost sdfg we set the array descriptor to be transient again, + # Since it is not needed beyond it. Furthermore, this ensures that the codegen + # allocates the array and does not expect it as input to the kernel + new_desc.transient = True + + # Memlet related helper functions + def get_memlet_subset(self, map_chain: List[nodes.MapEntry], node: nodes.Node): + """ + Compute the memlet subset to access an array based on the position of a node within nested GPU maps. + + For each GPU_Device or GPU_ThreadBlock map in the chain: + - If the node lies inside the map (but is not the map entry or exit itself), + the subset is the single index corresponding to the map parameter (symbolic). + - Otherwise, the full range of the map dimension is used. 
+ + This ensures that memlets correctly represent per-thread or per-block slices + when moving arrays out of kernel scopes. + + Args: + map_chain: List of MapEntry nodes representing nested maps from outermost to innermost. + node: The node for which to determine the subset (could be an access node or map entry/exit). + + Returns: + A list of subsets (start, end, stride) tuples for each map dimension. + """ + subset = [] + for next_map in map_chain: + if not next_map.map.schedule in [dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock]: + continue + + map_parent_state = self._node_to_state_cache[next_map] + for param, (start, end, stride) in zip(next_map.map.params, next_map.map.range.ndrange()): + + node_is_map = ((isinstance(node, nodes.MapEntry) and node == next_map) + or (isinstance(node, nodes.MapExit) and map_parent_state.exit_node(next_map) == node)) + node_state = self._node_to_state_cache[node] + if helpers.contained_in(node_state, node, next_map) and not node_is_map: + index = symbol(param) + subset.append((index, index, 1)) + else: + subset.append((start, end, stride)) + + return subset + + def update_memlets(self, kernel_entry: nodes.MapEntry, array_name: str, outermost_node: nodes.Node, + access_nodes: Set[nodes.AccessNode]) -> None: + """ + Updates all memlets related to a given transient array to reflect correct data + movement when moving array out of the kernel entry. + + Any map enclosing the `outermost_node` also encloses all access nodes and is + used to determine which maps are strictly above the access nodes. Based on this, + we compute the correct memlet subset that includes the additional dimensions + from the GPU map hierarchy. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel scope. + array_name: Name of the transient array being moved out. + outermost_node: The outermost node. + access_nodes: Set of AccessNodes inside the kernel that reference the same array. 
+ """ + map_entry_chain, _ = self.get_maps_between(kernel_entry, outermost_node) + params_as_ranges = self.get_memlet_subset(map_entry_chain, outermost_node) + + # Update in and out path memlets + visited: Set[MultiConnectorEdge[Memlet]] = set() + for access_node in access_nodes: + # in paths + for path in self.in_paths(access_node): + for edge in path: + + # Guards + if edge in visited: + continue + + if edge.data.data == array_name: + old_range = edge.data.subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.subset = Range(new_range) + visited.add(edge) + + elif edge.data.data != array_name and edge.dst is access_node and edge.data.dst_subset is not None: + old_range = edge.data.dst_subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.dst_subset = Range(new_range) + visited.add(edge) + + else: + continue + + # out paths + for path in self.out_paths(access_node): + for edge in path: + if edge in visited: + continue + + if edge.data.data == array_name: + old_range = edge.data.subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.subset = Range(new_range) + visited.add(edge) + + elif (edge.data.data + != array_name) and edge.src is access_node and edge.data.src_subset is not None: + old_range = edge.data.src_subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.src_subset = Range(new_range) + visited.add(edge) + + else: + continue + + # Array, symbol and renaming related helper functions + def get_new_shape_info(self, array_desc: dt.Array, map_exit_chain: List[nodes.MapEntry]): + """ + Calculate the new shape, strides, total size, and offsets for a transient array + when moving it out of a GPU_Device kernel. + + Each GPU_Device map adds dimensions to allocate disjoint slices per thread. + + For example: + + for x, y in dace.map[0:128, 0:32] @ GPU_Device: + gpu_A = dace.define_local([64], dtype, storage=GPU_Global) + + gpu_A's shape changes from [64] to [128, 32, 64] to give each thread its own slice + (i.e. gpu_A[x, y, 64]). + + Args: + array_desc: Original array descriptor. + map_exit_chain: List of MapEntry nodes between array and kernel exit. + + Returns: + Tuple (new_shape, new_strides, new_total_size, new_offsets) for the updated array. 
+ """ + extended_size = [] + new_strides = list(array_desc.strides) + new_offsets = list(array_desc.offset) + for next_map in map_exit_chain: + if not next_map.map.schedule in [dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock]: + continue + + map_range: Range = next_map.map.range + max_elements = map_range.max_element() + min_elements = map_range.min_element() + range_size = [max_elem + 1 - min_elem for max_elem, min_elem in zip(max_elements, min_elements)] + + #TODO: check this / clean (maybe support packed C and packed Fortran layouts separately for code readability future) + old_total_size = array_desc.total_size + accumulator = old_total_size + new_strides.insert(0, old_total_size) + for cur_range_size in range_size[:-1]: + new_strides.insert(0, accumulator) # insert before (mult with volumes) + accumulator = accumulator * cur_range_size + + extended_size = range_size + extended_size + #new_strides = [1 for _ in next_map.map.params] + new_strides # add 1 per dimension + new_offsets = [0 for _ in next_map.map.params] + new_offsets # add 0 per dimension + + new_shape = extended_size + list(array_desc.shape) + new_total_size = functools.reduce(sympy.Mul, extended_size, 1) * array_desc.total_size + + return new_shape, new_strides, new_total_size, new_offsets + + # TODO: Ask Yakup -> No states test but this should be alright + def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: str, array_desc: dt.Array) -> None: + """ + Replaces all occurrences of an array name in the given SDFGs, including its data descriptor, + memlets, connectors and access nodes with a new name. + + Args: + sdfgs (Set[SDFG]): The SDFGs in which to perform the renaming. + old_name (str): The original array name to be replaced. + new_name (str): The new array name. + new_descriptor (dt.Array): The data descriptor associated with the old and new name. + """ + for sdfg in sdfgs: + + # Replace by removing the data descriptor and adding it with the new name + sdfg.remove_data(old_name, False) + sdfg.add_datadesc(new_name, array_desc) + sdfg.replace(old_name, new_name) + + # Find all states + for state in sdfg.states(): + for edge in state.edges(): + + # Update out connectors + src = edge.src + old_out_conn = f"OUT_{old_name}" + new_out_conn = f"OUT_{new_name}" + if edge.src_conn == old_out_conn: + edge.src_conn = new_out_conn + src.remove_out_connector(old_out_conn) + src.add_out_connector(new_out_conn) + + # Update in connectors + dst = edge.dst + old_in_conn = f"IN_{old_name}" + new_in_conn = f"IN_{new_name}" + if edge.dst_conn == old_in_conn: + edge.dst_conn = new_in_conn + dst.remove_in_connector(old_in_conn) + dst.add_in_connector(new_in_conn) + + def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) -> None: + """ + Ensures symbols from GPU maps are defined in all nested SDFGs. + + When lifting arrays out of GPU maps, any used symbols (e.g., map indices) + must be available in nested SDFGs for correct memlet updates. + This function collects such symbols from the map scopes and adds them to + the symbol tables and mappings of all nested SDFGs under `top_sdfg`. + + Args: + map_entry_chain: List of GPU MapEntry nodes whose symbols are relevant. + top_sdfg: The top-level SDFG under which symbols will be propagated. 
+ """ + all_symbols = set() + for next_map in map_entry_chain: + if not next_map.map.schedule in [ + dace.dtypes.ScheduleType.GPU_Device, dace.dtypes.ScheduleType.GPU_ThreadBlock + ]: + continue + all_symbols = all_symbols | next_map.used_symbols_within_scope(self._node_to_state_cache[next_map]) + + for sdfg in top_sdfg.all_sdfgs_recursive(): + nsdfg_node = sdfg.parent_nsdfg_node + if nsdfg_node is None: + continue + + for symbol in all_symbols: + if str(symbol) not in sdfg.symbols: + sdfg.add_symbol(str(symbol), dace.dtypes.int32) + if str(symbol) not in nsdfg_node.symbol_mapping: + nsdfg_node.symbol_mapping[symbol] = dace.symbol(symbol) + + # Array analysis and metadata functions + def collect_array_descriptor_usage( + self, map_entry: nodes.MapEntry, + array_name: str) -> Set[Tuple[dt.Array, SDFG, FrozenSet[SDFG], FrozenSet[nodes.AccessNode]]]: + """ + Tracks usage of a transient array across nested SDFGs within the scope of a map. + + For each array it collects: + - the outermost SDFG where it is defined or passed through, + - all SDFGs in which it is accessed or passed via connectors, + - all AccessNodes referencing it in those SDFGs. + + Note: By "same array" we mean arrays with the same name and connected via memlets; + multiple descriptor objects (dt.Array) may exist across SDFGs for the same logical array. + + Args: + map_entry: The MapEntry node whose scope is used for analysis. + array_name: The name of the array to analyze. + + Returns: + A set of tuples, each containing: + - one of potentially many dt.Array descriptors, + - the outermost defining or using SDFG, + - a frozenset of all involved SDFGs, + - a frozenset of all AccessNodes using this array. + """ + access_nodes_info: List[Tuple[nodes.AccessNode, SDFGState, + SDFG]] = self.get_access_nodes_within_map(map_entry, array_name) + + last_sdfg: SDFG = self._node_to_sdfg_cache[map_entry] + + result: Set[Tuple[dt.Array, SDFG, Set[SDFG], Set[nodes.AccessNode]]] = set() + visited_sdfgs: Set[SDFG] = set() + + for access_node, state, sdfg in access_nodes_info: + + # Skip visited sdfgs where the array name is defined + if sdfg in visited_sdfgs: + continue + + # Get the array_desc (there may be several copies across SDFG, but + # we are only interested in the information thus this is fine) + array_desc = access_node.desc(state) + + # Collect all sdfgs and access nodes which refer to the same array + # (we determine this by inspecting if the array name is passed via connectors) + sdfg_set: Set[SDFG] = set() + access_nodes_set: Set[nodes.AccessNode] = set() + access_nodes_set.add(access_node) + + # Get all parent SDFGs and the outermost sdfg where defined + current_sdfg = sdfg + outermost_sdfg = current_sdfg + while True: + sdfg_set.add(current_sdfg) + + # We have reached the map's sdfg, so this is the + # outermost_sdfg we consider + if current_sdfg == last_sdfg: + outermost_sdfg = current_sdfg + break + + nsdfg_node = current_sdfg.parent_nsdfg_node + if array_name in nsdfg_node.in_connectors or array_name in nsdfg_node.out_connectors: + current_sdfg = current_sdfg.parent_sdfg + outermost_sdfg = current_sdfg + else: + break + + # Get all child SDFGs where the array was also passed to + queue = [sdfg] + while queue: + current_sdfg = queue.pop(0) + for child_state in current_sdfg.states(): + for node in child_state.nodes(): + if not isinstance(node, nodes.NestedSDFG): + continue + + nsdfg_node = node + if array_name in nsdfg_node.in_connectors or array_name in nsdfg_node.out_connectors: + queue.append(nsdfg_node.sdfg) + 
sdfg_set.add(nsdfg_node.sdfg) + + # Get all access nodes with the array name used in the sdfgs we found + for current_sdfg in sdfg_set: + for current_state in current_sdfg.states(): + for node in current_state.nodes(): + if isinstance(node, nodes.AccessNode) and node.data == array_name: + access_nodes_set.add(node) + + # Update all visited sdfgs + visited_sdfgs.update(sdfg_set) + + # Finally add information to the result + result.add((array_desc, outermost_sdfg, frozenset(sdfg_set), frozenset(access_nodes_set))) + + return result + + def new_name_required(self, map_entry: nodes.MapEntry, array_name: str, + sdfg_defined: FrozenSet[SDFG]) -> Tuple[bool, str]: + """ + Returns whether the array_name is also used at an SDFG which is not in the sdfg_defined set. + This means that the array_name at that SDFG refers to another data descriptor. + Another new name is suggested if this case occurs. + + Args: + map_entry: The MapEntry node whose scope is used to determine name usage. + array_name: The name of the data descriptor of interest + sdfg_defined: where the data descriptor is defined + + Returns: + A Tuple where first element is indicatin whether a new name is required, and + the other is either the same name if no new name is required or otherwise a new name suggestion. + """ + map_parent_sdfg = self._node_to_sdfg_cache[map_entry] + taken_names = set() + + for sdfg in map_parent_sdfg.all_sdfgs_recursive(): + + # Continue if sdfg is neither the map's parent state + # or not contained within the map scope + nsdfg_node = sdfg.parent_nsdfg_node + state = self._node_to_state_cache[nsdfg_node] if nsdfg_node else None + + if not ((nsdfg_node and state and helpers.contained_in(state, nsdfg_node, map_entry)) + or sdfg is map_parent_sdfg): + continue + + # Taken names are all symbol and array identifiers of sdfgs in which + # the array_name's data descriptor we are interested in IS NOT defined + if sdfg not in sdfg_defined: + taken_names.update(sdfg.arrays.keys()) + taken_names.update(sdfg.used_symbols(True)) + + if array_name in taken_names: + counter = 0 + new_name = f"local_{counter}_{array_name}" + while new_name in taken_names: + counter += 1 + new_name = f"local_{counter}_{array_name}" + + return True, new_name + else: + return False, array_name + + # Utility functions - basic building blocks + def get_access_nodes_within_map(self, map_entry: nodes.MapEntry, + data_name: str) -> List[Tuple[nodes.AccessNode, SDFGState, SDFG]]: + """ + Finds all AccessNodes that refer to the given `data_name` and are located inside + the scope of the specified MapEntry. + + Returns: + A list of tuples, each consisting of: + - the matching AccessNode, + - the SDFGState in which it resides, + - and the parent SDFG containing the node. + """ + starting_sdfg = self._node_to_sdfg_cache[map_entry] + matching_access_nodes = [] + + for node, parent_state in starting_sdfg.all_nodes_recursive(): + + if (isinstance(node, nodes.AccessNode) and node.data == data_name + and helpers.contained_in(parent_state, node, map_entry)): + + parent_sdfg = self._node_to_sdfg_cache[node] + matching_access_nodes.append((node, parent_state, parent_sdfg)) + + return matching_access_nodes + + def get_maps_between(self, stop_map_entry: nodes.MapEntry, + node: nodes.Node) -> Tuple[List[nodes.MapEntry], List[nodes.MapExit]]: + """ + Returns all MapEntry/MapExit pairs between `node` and `stop_map_entry`, inclusive. + + Maps are returned from innermost to outermost, starting at the scope of `node` and + ending at `stop_map_entry`. 
Assumes that `node` is (directly or indirectly via a + nestedSDFG) contained within the `stop_map_entry`'s scope. + + Args: + stop_map_entry: The outermost MapEntry to stop at (inclusive). + node: The node from which to begin scope traversal. + + Returns: + A tuple of two lists: + - List of MapEntry nodes (from inner to outer scope), + - List of corresponding MapExit nodes. + """ + stop_state = self._node_to_state_cache[stop_map_entry] + stop_exit = stop_state.exit_node(stop_map_entry) + + entries: List[nodes.MapEntry] = [] + exits: List[nodes.MapExit] = [] + + current_state = self._node_to_state_cache[node] + parent_info = helpers.get_parent_map(current_state, node) + + while True: + if parent_info is None: + raise ValueError("Expected node to be in scope of stop_map_entry, but no parent map was found.") + + entry, state = parent_info + exit_node = state.exit_node(entry) + + entries.append(entry) + exits.append(exit_node) + + if exit_node == stop_exit: + break + + parent_info = helpers.get_parent_map(state, entry) + + return entries, exits + + def get_nearest_access_node(self, access_nodes: List[nodes.AccessNode], node: nodes.Node) -> nodes.AccessNode: + """ + Finds the closest access node (by graph distance) to the given node + within the same state. Direction is ignored. + + Args: + access_nodes: List of candidate AccessNodes to search from. + node: The node from which to start the search. + + Returns: + The closest AccessNode (by number of edges traversed). + + Raises: + RuntimeError: If no access node is conected in the node's state to the node. + """ + state = self._node_to_state_cache[node] + + visited = set() + queue = [node] + while queue: + current = queue.pop(0) + if current in access_nodes: + return current + + visited.add(current) + for neighbor in state.neighbors(current): + if neighbor not in visited: + queue.append(neighbor) + + raise RuntimeError(f"No access node found connected to the given node {node}. ") + + def in_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: + """ + Traces all incoming dataflow paths to the given AccessNode. + Only searches in the same state where the AccessNode is. + + Returns: + A list of edge paths (each a list of edges). + """ + state = self._node_to_state_cache[access_node] + + # Start paths with in-edges to the access node. + initial_paths = [[edge] for edge in state.in_edges(access_node)] + queue = deque(initial_paths) + complete_paths = [] + + while queue: + # Get current path and see whether the starting node has in-edges carrying the access nodes data + current_path = queue.popleft() + first_edge = current_path[0] + current_node = first_edge.src + incoming_edges = [edge for edge in state.in_edges(current_node)] + + # If no incoming edges found, this path is complete + if len(incoming_edges) == 0: + + complete_paths.append(current_path) + continue + + # Otherwise, extend the current path and add it to the queue for further processing + for edge in incoming_edges: + if edge in current_path: + raise ValueError("Unexpected cycle detected") + + extended_path = [edge] + current_path + queue.append(extended_path) + + return complete_paths + + def out_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: + """ + Traces all outgoing dataflow paths to the given AccessNode. + Only searches in the same state where the AccessNode is. + + Returns: + A list of edge paths (each a list of edges). 
+ """ + state: SDFGState = self._node_to_state_cache[access_node] + + initial_paths = [[edge] for edge in state.out_edges(access_node)] + queue = deque(initial_paths) + complete_paths = [] + + while queue: + # Get current path and see whether the last node has out-edges carrying the access nodes data + current_path = queue.popleft() + last_edge = current_path[-1] + current_node = last_edge.dst + outgoing_edges = [edge for edge in state.out_edges(current_node)] + + # If no such edges found, this path is complete + if len(outgoing_edges) == 0: + complete_paths.append(current_path) + continue + + # Otherwise, extend the current path and add it to the queue for further processing + for edge in outgoing_edges: + + if edge in current_path: + raise ValueError("Unexpected cycle detected") + + extended_path = current_path + [edge] + queue.append(extended_path) + + return complete_paths diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py new file mode 100644 index 0000000000..4f73d41ef9 --- /dev/null +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -0,0 +1,355 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import warnings +from typing import Dict, Set, Tuple + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, NestedSDFG, Node +from dace.sdfg.state import LoopRegion +from dace.transformation import helpers, pass_pipeline as ppl, transformation + + +@properties.make_properties +@transformation.explicit_cf_compatible +class DefaultSharedMemorySync(ppl.Pass): + """ + This pass inserts synchronization tasklets that call "__syncthreads()". + This is for GPUs. + + Synchronization is added after GPU_ThreadBlock (TB) MapExits if the TB map + writes to shared memory or after collaborative writes to shared memory (smem). + + Important notes: + - Calling "__syncthreads()" inside a TB map can lead to deadlocks, + for example when only a subset of threads participates (thread divergence). + Therefore, users must **not** write to shared memory inside a Sequential + map or LoopRegion that is nested within a TB map. + + - If shared memory is still written sequentially within a TB map, the missing + intermediate synchronizations may lead to race conditions and incorrect results. + Because deadlocks are worse than race conditions, this pass avoids inserting + synchronization inside TB maps, but it will warn the user about potential risks. + + - When writing to and reading from shared memory within the same TB map, + users must ensure that no synchronization is required, since barriers + are not inserted automatically in this case (again, to avoid deadlocks). + If synchronization is needed, the computation should instead be split + across sequential TB maps. There is no warning for race conditions in this + case for misbehavior. + + - In nested TB maps (e.g., GPU_Device map -> TB map -> TB map ...), + synchronization is only inserted at the outermost TB map's exit, + again to avoid deadlocks. 
+ """ + + def __init__(self): + """Initialize the synchronization pass.""" + # Cache each node's parent state during apply_pass() + self._node_to_parent_state: Dict[Node, SDFGState] = dict() + + def apply_pass(self, sdfg: SDFG, _) -> None: + """ + Insert synchronization barriers (`__syncthreads()`) where needed to ensure + shared memory writes are synchronied for potential subsequent reads. + + This pass performs the following steps: + 1. Collect all ThreadBlock-scheduled MapExits and candidate collaborative + shared-memory writes (AccessNodes). + 2. Analyze ThreadBlock MapExits for synchronization requirements. + 3. Insert synchronization barriers after both MapExits and collaborative + shared-memory writes as needed. + """ + + # 1. Find all GPU_ThreadBlock-scheduled Maps and all collaborative writes to + # GPU shared memory, and cache each node's parent state for convenience. + tb_map_exits: Dict[MapExit, SDFGState] = dict() + collaborative_smem_copies: Dict[AccessNode, SDFGState] = dict() + for node, parent_state in sdfg.all_nodes_recursive(): + self._node_to_parent_state[node] = parent_state + if isinstance(node, MapExit) and node.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + tb_map_exits[node] = parent_state + elif isinstance(node, AccessNode) and self.is_collaborative_smem_write(node, parent_state): + collaborative_smem_copies[node] = parent_state + + + # 2. Identify TB MapExits requiring a synchronization barrier + sync_requiring_exits = self.identify_synchronization_tb_exits(tb_map_exits) + + # 3. Insert synchronization barriers for previous TB MapExits + self.insert_synchronization_after_nodes(sync_requiring_exits) + + # 4. Insert synchronization after collaborative shared memory writes + self.insert_synchronization_after_nodes(collaborative_smem_copies) + + def is_collaborative_smem_write(self, node: AccessNode, state: SDFGState) -> bool: + """ + Determine whether the given AccessNode corresponds to a collaborative + shared-memory (smem) write, i.e., whether it is written cooperatively + by GPU threads at the device level but not within a thread block map. + + Parameters + ---------- + node : AccessNode + The candidate access node. + state : SDFGState + The state in which the node resides. + + Returns + ------- + bool + True if the node is a collaborative smem write, False otherwise. + """ + # 1. node is not stored in shared memory - skip + if node.desc(state).storage != dtypes.StorageType.GPU_Shared: + return False + + # 2. To my knowledge, it is not a collaborative write if the result comes from a ThreadBlock map. + if all(isinstance(pred, MapExit) and pred.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock + for pred in state.predecessors(node)): + return False + + # 3. If all in edges are empty, there is no write - and no sync necessary + if all(edge.data.is_empty() for edge in state.in_edges(node)): + return False + + # 4. It is a collaborative copy if it is within a kernel but not within a GPU_ThreadBlock map + if (not gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_Device]) + or gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_ThreadBlock])): + return False + + return True + + def identify_synchronization_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> Dict[MapExit, SDFGState]: + """ + Identify ThreadBlock exits after which "__syncthreads()" should be called. 
+
+        Parameters
+        ----------
+        tb_map_exits : Dict[MapExit, SDFGState]
+            Mapping from GPU_ThreadBlock-scheduled MapExit nodes to their parent SDFGState.
+
+        Returns
+        -------
+        Dict[MapExit, SDFGState]
+            Subset of `tb_map_exits` where any AccessNode between the entry and exit
+            uses GPU shared memory, indicating a synchronization barrier is needed.
+        """
+        sync_requiring_exits: Dict[MapExit, SDFGState] = {}
+
+        for map_exit, state in tb_map_exits.items():
+
+            # Analyze the map that belongs to this exit
+            map_entry = state.entry_node(map_exit)
+            writes_to_smem, race_cond_danger, has_tb_parent = self.tb_exits_analysis(map_entry, map_exit, state)
+
+            # Skip: if this TB map is nested inside another TB map in the same kernel
+            # (i.e., before reaching the GPU_Device map), synchronization responsibility belongs
+            # to the outermost such TB map in the kernel.
+            if has_tb_parent:
+                continue
+
+            # Warn user: potential race condition detected.
+            elif race_cond_danger and writes_to_smem:
+                warnings.warn(
+                    f"Race condition danger: LoopRegion or Sequential Map inside ThreadBlock map {map_entry} "
+                    "writes to GPU shared memory. No synchronization occurs for intermediate steps, "
+                    "because '__syncthreads()' is only called outside the ThreadBlock map to avoid potential deadlocks. "
+                    "Please consider moving the LoopRegion or Sequential Map outside the ThreadBlock map.")
+                sync_requiring_exits[map_exit] = state
+
+            # TB map writes to shared memory: synchronization is needed
+            elif writes_to_smem:
+                sync_requiring_exits[map_exit] = state
+
+        return sync_requiring_exits
+
+    def tb_exits_analysis(self, map_entry: MapEntry, map_exit: MapExit, state: SDFGState) -> Tuple[bool, bool, bool]:
+        """
+        Analyze a GPU_ThreadBlock-scheduled map to determine:
+        - whether it writes to shared memory,
+        - whether such writes may cause race conditions, and
+        - whether it is nested within another GPU_ThreadBlock map inside the kernel.
+
+        Returns a tuple of three booleans:
+
+        1. `writes_to_shared_memory`:
+           True if the map writes to GPU shared memory. This includes writes
+           directly at the MapExit or within the map scope.
+
+        2. `race_cond_danger`:
+           True if there is a potential race condition due to shared memory writes
+           inside either:
+           - a sequentially scheduled map, or
+           - a loop region.
+           (Note: single-iteration loops/sequential maps are not treated differently;
+           they are still marked as dangerous, even though they cannot cause races.)
+
+        3. `has_parent_tb_map`:
+           True if this ThreadBlock map is nested inside another ThreadBlock map
+           (i.e., there exists another TB map between the enclosing GPU_Device
+           map and the current TB map).
+
+        Parameters
+        ----------
+        map_entry : MapEntry
+            The entry node of the ThreadBlock map.
+        map_exit : MapExit
+            The exit node of the ThreadBlock map.
+        state : SDFGState
+            The parent state containing the map.
+
+        Returns
+        -------
+        Tuple[bool, bool, bool]
+            A tuple:
+            `(writes_to_shared_memory, race_cond_danger, has_parent_tb_map)`
+        """
+        # Initially, the flags are all set to False
+        writes_to_shared_memory = False
+        race_cond_danger = False
+        has_parent_tb_map = False
+
+        # 1. Check if the ThreadBlock (TB) map writes to shared memory
+        for edge in state.out_edges(map_exit):
+            is_smem: bool = (isinstance(edge.dst, AccessNode)
+                             and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared)
+            if is_smem and not edge.data.is_empty():
+                writes_to_shared_memory = True
+                break
+
+        # 2. 
Search between map entry and exit:
+        #    - Detect writes to shared memory (unless already found)
+        #    - Collect nested SDFGs for later analysis
+        nested_sdfgs: Set[NestedSDFG] = set()
+
+        for node in state.all_nodes_between(map_entry, map_exit):
+            if not writes_to_shared_memory and isinstance(node, AccessNode):
+                # Check if this AccessNode writes to shared memory
+                if (node.desc(state).storage == dtypes.StorageType.GPU_Shared
+                        and any(not edge.data.is_empty() for edge in state.in_edges(node))):
+                    writes_to_shared_memory = True
+
+            elif isinstance(node, NestedSDFG):
+                nested_sdfgs.add(node)
+
+        # 3. Recursively analyze nested SDFGs:
+        #    - Detect shared memory writes (only if not already found)
+        #    - Check for potential race conditions in loop regions (only if not already flagged)
+        for nsdfg in nested_sdfgs:
+            subs_sdfg = nsdfg.sdfg
+            if not writes_to_shared_memory:
+                writes_to_shared_memory = self.sdfg_writes_to_smem(subs_sdfg)
+
+            if not race_cond_danger:
+                race_cond_danger = self.writes_to_smem_inside_loopregion(subs_sdfg)
+
+        # 4. Check for race condition danger in sequential maps that use shared memory
+        #    (only if not already flagged)
+        if not race_cond_danger:
+            race_cond_danger = any(
+                inner_scope.map.schedule == dtypes.ScheduleType.Sequential and self.map_writes_to_smem(inner_scope)
+                for _, inner_scope in helpers.get_internal_scopes(state, map_entry))
+
+        # 5. Check if this TB map is nested within another TB map
+        parent = helpers.get_parent_map(state, map_entry)
+
+        while parent:
+            parent_map, parent_state = parent
+            if parent_map.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock:
+                has_parent_tb_map = True
+                break
+            if parent_map.map.schedule == dtypes.ScheduleType.GPU_Device:
+                break
+            parent = helpers.get_parent_map(parent_state, parent_map)
+
+        # 6. Return the results
+        return writes_to_shared_memory, race_cond_danger, has_parent_tb_map
+
+    def writes_to_smem_inside_loopregion(self, sdfg: SDFG) -> bool:
+        """
+        Return True if the SDFG writes to GPU shared memory (smem) inside
+        a LoopRegion. This check is recursive and includes nested SDFGs.
+        """
+        for node in sdfg.nodes():
+            if isinstance(node, LoopRegion):
+                # Traverse all nodes inside the loop region and look for written
+                # shared-memory access nodes
+                for subnode, parent in node.all_nodes_recursive():
+                    if (isinstance(subnode, AccessNode)
+                            and subnode.desc(parent).storage == dtypes.StorageType.GPU_Shared
+                            and any(not edge.data.is_empty() for edge in parent.in_edges(subnode))):
+                        return True
+
+            elif isinstance(node, NestedSDFG):
+                # Recurse into nested SDFGs
+                if self.writes_to_smem_inside_loopregion(node.sdfg):
+                    return True
+
+        return False
+
+    def sdfg_writes_to_smem(self, sdfg: SDFG) -> bool:
+        """
+        Return True if the SDFG writes to GPU shared memory (smem),
+        i.e., contains an AccessNode with GPU_Shared storage that has
+        at least one non-empty incoming edge.
+        """
+        for node, state in sdfg.all_nodes_recursive():
+            if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Shared
+                    and any(not edge.data.is_empty() for edge in state.in_edges(node))):
+                return True
+        return False
+
+    def map_writes_to_smem(self, map_entry: MapEntry) -> bool:
+        """
+        Return True if the map writes to GPU shared memory (smem).
+
+        A map is considered to write to smem if:
+        - Any AccessNode with GPU_Shared storage is written to at the MapExit, or
+        - Such writes occur within the map scope, or
+        - A nested SDFG within the map writes to smem.
+        """
+        state = self._node_to_parent_state[map_entry]
+        map_exit = state.exit_node(map_entry)
+
+        # 1. 
Check if MapExit writes directly to shared memory + for edge in state.out_edges(map_exit): + if (isinstance(edge.dst, AccessNode) and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared + and not edge.data.is_empty()): + return True + + # 2. Inspect nodes inside the map scope + for node in state.all_nodes_between(map_entry, map_exit): + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node))): + return True + + if isinstance(node, NestedSDFG) and self.sdfg_writes_to_smem(node.sdfg): + return True + + # No writes to shared memory found + return False + + def insert_synchronization_after_nodes(self, nodes: Dict[Node, SDFGState]) -> None: + """ + Insert synchronization tasklets (calling `__syncthreads()`) after the given + GPU-related nodes. + + Parameters + ---------- + nodes : Dict[Node, SDFGState] + Mapping from SDFG nodes to their parent states after which a + synchronization tasklet should be inserted. + """ + for node, state in nodes.items(): + + sync_tasklet = state.add_tasklet(name="sync_threads", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP) + + for succ in state.successors(node): + state.add_edge(sync_tasklet, None, succ, None, dace.Memlet()) + + state.add_edge(node, None, sync_tasklet, None, dace.Memlet()) diff --git a/tests/codegen/cuda_mempool_test.py b/tests/codegen/cuda_mempool_test.py index eccd97ee61..128634720c 100644 --- a/tests/codegen/cuda_mempool_test.py +++ b/tests/codegen/cuda_mempool_test.py @@ -144,7 +144,8 @@ def tester(A: CudaArray, B: CudaArray): code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 + assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 or code.count( + 'cudaFreeAsync(pooled, gpu_stream0') == 1 # Test code import cupy as cp @@ -198,7 +199,8 @@ def test_memory_pool_if_states(cnd): sdfg.validate() code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 + assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 or code.count( + f'cudaFreeAsync({tmp}, gpu_stream0') == 1 # Test code import cupy as cp diff --git a/tests/codegen/gpu_memcpy_test.py b/tests/codegen/gpu_memcpy_test.py index c7a3525f95..1cc650ffaa 100644 --- a/tests/codegen/gpu_memcpy_test.py +++ b/tests/codegen/gpu_memcpy_test.py @@ -15,11 +15,14 @@ rng = cp.random.default_rng(42) -def count_node(sdfg: dace.SDFG, node_type): +def count_node(sdfg: dace.SDFG, node_type, ignore_gpustream_nodes=True): nb_nodes = 0 for rsdfg in sdfg.all_sdfgs_recursive(): for state in sdfg.states(): for node in state.nodes(): + if (ignore_gpustream_nodes and isinstance(node, dace_nodes.AccessNode) + and node.desc(state).dtype == dace.dtypes.gpuStream_t): + continue if isinstance(node, node_type): nb_nodes += 1 return nb_nodes diff --git a/tests/codegen/nested_kernel_transient_test.py b/tests/codegen/nested_kernel_transient_test.py index 54488a3aac..d4c3182c16 100644 --- a/tests/codegen/nested_kernel_transient_test.py +++ b/tests/codegen/nested_kernel_transient_test.py @@ -24,7 +24,15 @@ def nested(A: dace.float64[128, 64]): state.add_edge(n, 'A', w, None, dace.Memlet('A')) if persistent: - sdfg.arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = 
sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) @@ -50,7 +58,15 @@ def transient(A: dace.float64[128, 64]): sdfg.apply_gpu_transformations() if persistent: - sdfg.cfg_list[-1].arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) @@ -87,7 +103,15 @@ def transient(A: dace.float64[128, 64]): sdfg.apply_gpu_transformations() if persistent: - sdfg.cfg_list[-1].arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) diff --git a/tests/cuda_block_test.py b/tests/cuda_block_test.py index 8b75376a00..74ee21fd90 100644 --- a/tests/cuda_block_test.py +++ b/tests/cuda_block_test.py @@ -181,6 +181,7 @@ def tester(A: dace.float64[200]): tasklet.location['gpu_block'] = 1 code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + sdfg.compile() assert '>= 2' in code and '<= 8' in code assert ' == 1' in code diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index f8553249ea..2a64cd2255 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -10,7 +10,7 @@ import dace import dace.library -from dace import dtypes +from dace import dtypes, Config from dace.codegen import codeobject, targets, compiler, compiled_sdfg, common @@ -31,9 +31,14 @@ def _cuda_helper(): }} }} """ - program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") - dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") + if Config.get('compiler', 'cuda', 'implementation') == 'experimental': + program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") + dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.experimental_cuda.ExperimentalCUDACodeGen, + "CudaDummy") + else: + program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") + dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") build_folder = dace.Config.get('default_build_folder') BUILD_PATH = 
os.path.join(build_folder, "cuda_helper")
diff --git a/tests/passes/gpu_specialization/gpu_stream_test.py b/tests/passes/gpu_specialization/gpu_stream_test.py
new file mode 100644
index 0000000000..07d1facdf9
--- /dev/null
+++ b/tests/passes/gpu_specialization/gpu_stream_test.py
@@ -0,0 +1,116 @@
+# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
+import pytest
+
+import dace
+from dace.codegen import common
+from dace.transformation.pass_pipeline import Pipeline
+from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
+from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
+from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels
+from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets
+from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets
+from dace.transformation.passes.gpu_specialization.insert_gpu_copy_tasklet import InsertGPUCopyTasklets
+from dace.transformation.passes.gpu_specialization.gpu_stream_topology_simplification import GPUStreamTopologySimplification
+
+gpu_stream_pipeline = Pipeline([
+    NaiveGPUStreamScheduler(),
+    InsertGPUStreamsToSDFGs(),
+    ConnectGPUStreamsToKernels(),
+    ConnectGPUStreamsToTasklets(),
+    InsertGPUStreamSyncTasklets(),
+    InsertGPUCopyTasklets(),
+    GPUStreamTopologySimplification(),
+])
+
+backend = common.get_gpu_backend()
+
+
+@pytest.mark.gpu
+def test_basic():
+    """
+    A simple memory copy program.
+
+    Since the SDFG has a single connected component, exactly one GPU stream is used
+    and must be synchronized at the end of the state. For each synchronized stream,
+    the pipeline introduces a memlet from the synchronization tasklet to a GPU stream
+    AccessNode. Therefore, it is sufficient to verify that there is only one sink node
+    with one incoming edge, verify its dtype, and check for the presence of a preceding
+    synchronization tasklet.
+    """
+
+    @dace.program
+    def simple_copy(A: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global,
+                    B: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global):
+        for i in dace.map[0:128:1] @ dace.dtypes.ScheduleType.GPU_Device:
+            B[i] = A[i]
+
+    sdfg = simple_copy.to_sdfg()
+    gpu_stream_pipeline.apply_pass(sdfg, {})
+
+    state = sdfg.states()[0]
+    sink_nodes = state.sink_nodes()
+    node = sink_nodes[0]
+    assert (
+        len(sink_nodes) == 1 and len(state.in_edges(node)) == 1 and isinstance(node, dace.nodes.AccessNode)
+        and node.desc(state).dtype == dace.dtypes.gpuStream_t
+    ), ("Only one sink node should exist; it must be a GPU stream AccessNode with one incoming edge.")
+
+    assert all(isinstance(pre, dace.nodes.Tasklet) and f"{backend}StreamSynchronize(" in pre.code.as_string
+               for pre in state.predecessors(node)), (
+                   "At the end of each state, every used stream must be synchronized.")
+
+
+@pytest.mark.gpu
+def test_extended():
+    """
+    A program that performs two independent memory copies.
+
+    The input arrays reside in host memory, and `apply_gpu_transformations()` is applied
+    to the program. As a result, the data is first copied to GPU global memory, after
+    which the two copies are executed on the GPU. Since these copies form two
+    independent connected components in the resulting SDFG, the naive GPU stream
+    scheduler assigns them to different GPU streams.
+
+    This test verifies that exactly two GPU streams are used, that both streams are
+    synchronized at the end of the state, and that the corresponding asynchronous
+    memory copy tasklets are correctly associated with their assigned streams.
+    """
+
+    @dace.program
+    def independent_copies(A: dace.uint32[128], B: dace.uint32[128], C: dace.uint32[128], D: dace.uint32[128]):
+        for i in dace.map[0:128:1]:
+            B[i] = A[i]
+        for i in dace.map[0:128:1]:
+            D[i] = C[i]
+
+    sdfg = independent_copies.to_sdfg()
+
+    # Transform such that the program can run on GPU, then apply the GPU stream pipeline
+    sdfg.apply_gpu_transformations()
+    gpu_stream_pipeline.apply_pass(sdfg, {})
+
+    # Test 1: Two GPU streams are used, since the naive stream scheduler assigns one
+    # stream per connected component
+    state = sdfg.states()[0]
+    sink_nodes = state.sink_nodes()
+    node = sink_nodes[0]
+    assert (len(sink_nodes) == 1 and len(state.in_edges(node)) == 2 and isinstance(node, dace.nodes.AccessNode)
+            and node.desc(state).dtype == dace.dtypes.gpuStream_t), (
+                "Only one sink node should exist; it must be a GPU stream AccessNode with "
+                "two incoming edges, since the original graph consisted of two connected components.")
+
+    # Test 2: All used streams are synchronized at the end of the state
+    assert all(isinstance(pre, dace.nodes.Tasklet) and f"{backend}StreamSynchronize(" in pre.code.as_string
+               for pre in state.predecessors(node)), (
+                   "At the end of each state, every used stream must be synchronized.")
+
+    # Test 3: Check that we have memory copy tasklets (as we perform two "Main Memory -> GPU Global"
+    # memory copies and two "GPU Global -> Main Memory" memory copies by applying the GPU transformation)
+    # and that they reference the GPU stream input connector in the copy call
+    memcopy_tasklets = [
+        n for n in state.nodes() if isinstance(n, dace.nodes.Tasklet) and f"{backend}MemcpyAsync(" in n.code.as_string
+    ]
+    for tasklet in memcopy_tasklets:
+        assert len(tasklet.in_connectors) == 1, ("Memcpy tasklets must have exactly one input connector "
+                                                 "corresponding to the GPU stream.")
+
+        in_connector = next(iter(tasklet.in_connectors))
+
+        assert in_connector in tasklet.code.as_string, (
+            "Memcpy tasklets must reference their GPU stream input connector in the memcpy call.")
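Reviewer note (illustrative sketch, not part of the patch): a minimal example of how the new
DefaultSharedMemorySync pass from dace/transformation/passes/shared_memory_synchronization.py
could be exercised standalone, and how the 'compiler.cuda.implementation' configuration key
checked throughout this diff could be toggled for code generation. The program name and the
printed check below are assumptions for illustration only; the APIs used are either introduced
in this diff or standard DaCe (dace.program, apply_gpu_transformations, all_nodes_recursive,
dace.config.set_temporary, generate_code).

    # Sketch only: run the shared-memory synchronization pass on a trivial GPU program.
    import dace
    from dace.config import set_temporary
    from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync


    @dace.program
    def axpy(A: dace.float64[256], B: dace.float64[256]):
        for i in dace.map[0:256]:
            B[i] = 2.0 * A[i] + B[i]


    if __name__ == '__main__':
        sdfg = axpy.to_sdfg()
        sdfg.apply_gpu_transformations()

        # The pass only inserts __syncthreads() tasklets after thread-block scopes or
        # collaborative writes that touch GPU_Shared storage, so this plain map is
        # expected to remain unchanged (zero inserted tasklets).
        DefaultSharedMemorySync().apply_pass(sdfg, {})
        syncs = [n for n, _ in sdfg.all_nodes_recursive()
                 if isinstance(n, dace.nodes.Tasklet) and '__syncthreads' in n.code.as_string]
        print(f'inserted sync tasklets: {len(syncs)}')

        # Generate code with the experimental backend selected, scoped via set_temporary;
        # 'compiler.cuda.implementation' is the configuration key this patch introduces and checks.
        with set_temporary('compiler', 'cuda', 'implementation', value='experimental'):
            code = sdfg.generate_code()[0].clean_code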