diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index c7ea6258a6..5c112d1e89 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -807,9 +807,12 @@ def unparse_cr(sdfg, wcr_ast, dtype): def connected_to_gpu_memory(node: nodes.Node, state: SDFGState, sdfg: SDFG): for e in state.all_edges(node): path = state.memlet_path(e) - if ((isinstance(path[0].src, nodes.AccessNode) - and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)): + if (((isinstance(path[0].src, nodes.AccessNode) + and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)) + or ((isinstance(path[-1].dst, nodes.AccessNode) + and path[-1].dst.desc(sdfg).storage is dtypes.StorageType.GPU_Global))): return True + return False diff --git a/dace/dtypes.py b/dace/dtypes.py index c2835d85a4..190d078d0f 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -71,6 +71,7 @@ class ScheduleType(AutoNumberEnum): GPU_ThreadBlock = () #: Thread-block code GPU_ThreadBlock_Dynamic = () #: Allows rescheduling work within a block GPU_Persistent = () + GPU_Warp = () Snitch = () Snitch_Multicore = () @@ -84,6 +85,11 @@ class ScheduleType(AutoNumberEnum): ScheduleType.GPU_Persistent, ] +# A subset of GPU schedule types for ExperimentalCUDACodeGen +EXPERIMENTAL_GPU_SCHEDULES = [ + ScheduleType.GPU_Warp, +] + # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, @@ -95,6 +101,8 @@ class ScheduleType(AutoNumberEnum): StorageType.GPU_Shared, ] +GPU_KERNEL_ACCESSIBLE_STORAGES = [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.Register] + @undefined_safe_enum class ReductionType(AutoNumberEnum): @@ -192,7 +200,8 @@ class TilingType(AutoNumberEnum): ScheduleType.GPU_ThreadBlock: StorageType.Register, ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + ScheduleType.GPU_Warp: StorageType.Register, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -207,9 +216,10 @@ class TilingType(AutoNumberEnum): ScheduleType.GPU_Device: ScheduleType.GPU_ThreadBlock, ScheduleType.GPU_ThreadBlock: ScheduleType.Sequential, ScheduleType.GPU_ThreadBlock_Dynamic: ScheduleType.Sequential, + ScheduleType.GPU_Warp: ScheduleType.Sequential, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. @@ -1240,6 +1250,7 @@ class string(_DaCeArray, npt.NDArray[numpy.str_]): ... class vector(_DaCeArray, npt.NDArray[numpy.void]): ... class MPI_Request(_DaCeArray, npt.NDArray[numpy.void]): ... class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ... + class gpuStream_t(_DaCeArray, npt.NDArray[numpy.void]): ... # yapf: enable else: # Runtime definitions @@ -1260,6 +1271,7 @@ class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ... 
complex128 = typeclass(numpy.complex128) string = stringtype() MPI_Request = opaque('MPI_Request') + gpuStream_t = opaque('gpuStream_t') float32sr = Float32sr() @@ -1281,6 +1293,7 @@ class Typeclasses(AutoNumberEnum): float64 = float64 complex64 = complex64 complex128 = complex128 + gpuStream_t = gpuStream_t _bool = bool @@ -1508,6 +1521,7 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Persistent, ScheduleType.GPU_ThreadBlock, ScheduleType.GPU_ThreadBlock_Dynamic, + ScheduleType.GPU_Warp, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]: diff --git a/dace/transformation/passes/gpu_specialization/__init__.py b/dace/transformation/passes/gpu_specialization/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/transformation/passes/gpu_specialization/helpers/__init__.py b/dace/transformation/passes/gpu_specialization/helpers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py new file mode 100644 index 0000000000..1c22db4bd2 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py @@ -0,0 +1,525 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, Tuple, Union +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp +from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import generate_sync_debug_call +from dace.dtypes import StorageType +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. + """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. 
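+
+        For example (illustrative), an AccessNode over an array declared with
+        ``StorageType.GPU_Global`` yields ``StorageType.GPU_Global``, while a
+        Tasklet yields ``StorageType.Register``.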
+ """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + Defaults to `__dace_current_stream` placeholder, which can be changed by the scheduling pass + """ + # 2. Generate GPU stream expression + gpustream = "__dace_current_stream" + gpustream_expr = gpustream + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. + """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. 
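+
+        Examples
+        --------
+        Illustrative (hypothetical) values: for a column copy ``A[0:8, 3] -> B[0:8, 3]``
+        between row-major 8x4 arrays, one would expect roughly
+        ``copy_shape == [8]``, ``src_strides == dst_strides == [4]``, and the
+        returned expressions to be the data names ``"A"`` and ``"B"``
+        (exact values depend on the stride-collapsing logic above).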
+ """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_STORAGES: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = src_subset + # dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + return copy_shape, src_strides, dst_strides, src_node.data, dst_node.data + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + 
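+    # Illustrative usage sketch (hypothetical dispatch loop): a caller builds a
+    # CopyContext and uses the first strategy that reports applicability, e.g.
+    #
+    #     ctx = CopyContext(sdfg, state, src_node, dst_node, edge)
+    #     for strategy in (OutOfKernelCopyStrategy(),):
+    #         if strategy.applicable(ctx):
+    #             code = strategy.generate_copy(ctx)
+    #             break
+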
+ @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES + dtypes.EXPERIMENTAL_GPU_SCHEDULES: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False + + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." 
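+            # More than two dimensions: _generate_nd_copy emits nested loops
+            # around 2D async copies of the innermost two dimensions.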
+ copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. + """ + # ----------- Retrieve relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call -------------------- + + if is_contiguous_copy: + # Memory is linear: can use {backend}MemcpyAsync + copysize = ' * '.join(sym2cpp(copy_shape)) + copysize += f' * sizeof({ctype})' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync(_out_{dst_expr}, _in_{src_expr}, {copysize}, {kind}, {gpustream}));\n' + + else: + # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch + # This allows copying a strided 1D region + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Potentially synchronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() + return call + + def _generate_2d_copy(self, copy_context: CopyContext) -> None: + """ + Generates a 2D memory copy using {backend}Memcpy2DAsync. + + Three main cases are handled: + - Copy between row-major stored arrays with contiguous rows. + - Copy between column-major stored arrays with contiguous columns. + - A special case where a 2D copy can still be represented. + + Raises: + NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. + Such cases indicate an unsupported 2D copy and should be examined separately. + They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. + + Note: + {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), + but not both simultaneously. + """ + + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call if supported -------------------- + # Case: Row-major layout, rows are not strided. 
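+        # Illustrative example (hypothetical sizes): copying an 8x4 block out of an
+        # 8x8 row-major array A into a dense 8x4 array B gives src_strides == [8, 1]
+        # and dst_strides == [4, 1], i.e. spitch = 8 * sizeof(T),
+        # dpitch = 4 * sizeof(T), width = 4 * sizeof(T), height = 8.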
+        if (src_strides[1] == 1) and (dst_strides[1] == 1):
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[0])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Case: Column-major layout, no columns are strided.
+        elif (src_strides[0] == 1) and (dst_strides[0] == 1):
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[1])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Special case: the strides collapse into a single 2D copy
+        elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]):
+            # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with
+            # copy shape [I, J], src_strides [J*K, K], dst_strides [J, 1]. This can be represented with a
+            # single {backend}Memcpy2DAsync call!
+
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0] * copy_shape[1])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        else:
+            raise NotImplementedError(
+                f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}. "
+                "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken."
+            )
+
+        # Synchronization is potentially required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_nd_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates GPU code for copying N-dimensional arrays using 2D memory copies.
+
+        Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops
+        for any outer dimensions. Expects the copy to be contiguous and between
+        row-major storage locations.
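+
+        Illustrative example (hypothetical shape): for a copy_shape of [D0, D1, D2],
+        a single loop over ``__copyidx0 in [0, D0)`` is emitted, and each iteration
+        issues one {backend}Memcpy2DAsync that transfers a D1 x D2 tile, with the
+        loop index folded into the source and destination offsets via the
+        outermost strides.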
+        """
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+        num_dims = len(copy_shape)
+
+        # ----------- Guard against unsupported patterns --------------
+        if not (src_strides[-1] == 1 and dst_strides[-1] == 1):
+            src_node, dst_node = copy_context.src_node, copy_context.dst_node
+            src_storage = copy_context.get_storage_type(src_node)
+            dst_storage = copy_context.get_storage_type(dst_node)
+            raise NotImplementedError(
+                "N-dimensional GPU memory copies that are strided or involve column-major arrays are currently not supported.\n"
+                f"  Source node: {src_node} (storage: {src_storage})\n"
+                f"  Destination node: {copy_context.dst_node} (storage: {dst_storage})\n"
+                f"  Source strides: {src_strides}\n"
+                f"  Destination strides: {dst_strides}\n"
+                f"  Copy shape: {copy_shape}\n")
+
+        # ----------------- Generate and write backend call(s) --------------------
+
+        call = ""
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n"
+
+        # Write Memcpy2DAsync
+        offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})'
+        spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})'
+        width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})'
+        height = sym2cpp(copy_shape[-2])
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate call and write it
+        call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst}, {dpitch}, _in_{src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization is potentially required if syncdebug is enabled in the configuration
+        call += generate_sync_debug_call()
+
+        # Write for-loop footers
+        for dim in range(num_dims - 2):
+            call += "\n}"
+
+        # Return the code
+        return call
diff --git a/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py b/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py
new file mode 100644
index 0000000000..be9d510602
--- /dev/null
+++ b/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py
@@ -0,0 +1,35 @@
+# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
+from dace import Config
+from dace.codegen import common
+
+
+def get_gpu_stream_array_name() -> str:
+    return "gpu_streams"
+
+
+def get_gpu_stream_connector_name() -> str:
+    return "__stream_"
+
+
+def generate_sync_debug_call() -> str:
+    """
+    Generate backend synchronization and error-check calls as a string if
+    synchronous debugging is enabled. The backend prefix (e.g., 'cuda') is
+    determined via common.get_gpu_backend().
+
+    Returns
+    -------
+    str
+        The generated debug call code, or an empty string if debugging is disabled.
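+
+    Examples
+    --------
+    With the CUDA backend and ``compiler.cuda.syncdebug`` enabled, the returned
+    string is::
+
+        DACE_GPU_CHECK(cudaGetLastError());
+        DACE_GPU_CHECK(cudaDeviceSynchronize());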
+ """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call diff --git a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py new file mode 100644 index 0000000000..34cd37de4a --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py @@ -0,0 +1,254 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.transformation.passes.gpu_specialization.helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.sdfg.graph import Edge, MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import get_gpu_stream_connector_name + + +def create_viewed_copy_kernel(parent_state: dace.SDFGState, src_node: dace.nodes.AccessNode, + dst_node: dace.nodes.AccessNode, edge: Edge[dace.Memlet]) -> dace.SDFG: + # Currently only 1D and 2D copies are supported + map_ranges = dict() + for i, dim in enumerate(edge.data.subset): + map_ranges[f"i{i}"] = f"0:{dim[1]+1-dim[0]}:{dim[2]}" + + access_expr = ",".join(f"i{i}" for i in range(len(edge.data.subset))) + + src_desc = parent_state.sdfg.arrays[src_node.data] + dst_desc = parent_state.sdfg.arrays[dst_node.data] + + # Add new arrays for the copy SDFG + # Determine src and dst subsets + src_subset = edge.data.subset if edge.data.data == src_node.data else edge.data.other_subset + dst_subset = edge.data.other_subset if edge.data.data == src_node.data else edge.data.subset + + # Collect the new shapes + src_shape = [e + 1 - b for b, e, s in src_subset] + dst_shape = [e + 1 - b for b, e, s in dst_subset] + + # Preserve strides as-is + src_strides = src_desc.strides + dst_strides = dst_desc.strides + + _, src_view = parent_state.sdfg.add_view("view_" + src_node.data, src_shape, src_desc.dtype, src_desc.storage, + src_strides) + _, dst_view = parent_state.sdfg.add_view("view_" + dst_node.data, dst_shape, dst_desc.dtype, dst_desc.storage, + dst_strides) + + # In nested SDFG we add "view_" prefix + view_src_node = parent_state.add_access("view_" + src_node.data) + view_dst_node = parent_state.add_access("view_" + dst_node.data) + + # Create string subset expressions to return + src_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in src_subset]) + dst_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in dst_subset]) + + # Add copy kernel + tasklet, map_entry, map_exit = parent_state.add_mapped_tasklet( + name="gpu_copy_kernel_fallback", + map_ranges=map_ranges, + inputs={"_in": dace.memlet.Memlet(f"{view_src_node.data}[{access_expr}]")}, + outputs={"_out": dace.memlet.Memlet(f"{view_dst_node.data}[{access_expr}]")}, + code="_out = _in", + schedule=dtypes.ScheduleType.GPU_Device, + unroll_map=False, + language=dtypes.Language.Python, + external_edges=True, + propagate=True, + input_nodes={view_src_node.data: view_src_node}, + output_nodes={view_dst_node.data: view_dst_node}, + ) + + return view_src_node, src_subset_expr, view_dst_node, dst_subset_expr + + +@properties.make_properties 
+@transformation.explicit_cf_compatible +class InsertExplicitGPUGlobalMemoryCopies(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = set() + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. + """ + # Prepare GPU stream + + # Initialize the strategy for copies that occur outside of kernel execution + out_of_kernel_copy = OutOfKernelCopyStrategy() + + # Get all data copies to process the out of kernel copies + copy_worklist = self.find_all_data_copies(sdfg) + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge) + + # Only insert copy tasklets for GPU related copies occuring out of the + # kernel (i.e. 
a GPU_Device-scheduled map)
+            if not out_of_kernel_copy.applicable(copy_context):
+                continue
+
+            # If the subset has more than 2 dimensions and is not contiguous (i.e. it cannot
+            # be expressed as a single 1D memcpy), fall back to a copy kernel
+            if len(edge.data.subset) > 2 and not edge.data.subset.is_contiguous_subset(
+                    state.sdfg.arrays[edge.data.data]):
+
+                # If there is no other_subset, a plain mapped copy kernel over the subset suffices
+                if edge.data.other_subset is None:
+                    # Build a map over every dimension of the copied subset
+                    map_ranges = dict()
+                    for i, dim in enumerate(edge.data.subset):
+                        map_ranges[f"i{i}"] = f"{dim[0]}:{dim[1]+1}:{dim[2]}"
+                    access_expr = ",".join(f"i{i}" for i in range(len(edge.data.subset)))
+
+                    tasklet, map_entry, map_exit = state.add_mapped_tasklet(
+                        name="gpu_copy_kernel_fallback",
+                        map_ranges=map_ranges,
+                        inputs={"_in": dace.memlet.Memlet(f"{src_node.data}[{access_expr}]")},
+                        outputs={"_out": dace.memlet.Memlet(f"{dst_node.data}[{access_expr}]")},
+                        code="_out = _in",
+                        schedule=dtypes.ScheduleType.GPU_Device,
+                        unroll_map=False,
+                        language=dtypes.Language.Python,
+                        external_edges=True,
+                        propagate=True,
+                        input_nodes={src_node.data: src_node},
+                        output_nodes={dst_node.data: dst_node},
+                    )
+                    # The mapped tasklet now performs the copy; remove the original direct edge
+                    state.remove_edge(edge)
+                else:
+                    # Otherwise, route the copy through views so that source and destination
+                    # subsets can differ
+                    view_src_node, src_subset_expr, view_dst_node, dst_subset_expr = create_viewed_copy_kernel(
+                        state, src_node, dst_node, edge)
+                    state.remove_edge(edge)
+                    state.add_edge(src_node, None, view_src_node, "views",
+                                   dace.Memlet(f"{src_node.data}[{src_subset_expr}]"))
+                    state.add_edge(view_dst_node, "views", dst_node, None,
+                                   dace.Memlet(f"{dst_node.data}[{dst_subset_expr}]"))
+            else:
+                # Generate the copy call
+                code = out_of_kernel_copy.generate_copy(copy_context)
+
+                # Create the tasklet holding the generated copy code; the GPU stream
+                # placeholder in that code can later be bound by a scheduling pass
+                tasklet = state.add_tasklet("gpu_copy", {"_in_" + src_node.data}, {"_out_" + dst_node.data},
+                                            code,
+                                            language=dtypes.Language.CPP)
+
+                # Put the tasklet in between the edge
+                dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge
+
+                if memlet.other_subset is None:
+                    src_memlet = copy.deepcopy(memlet)
+                    src_memlet.data = src_node.data
+                    state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, src_memlet)
+                    dst_memlet = copy.deepcopy(memlet)
+                    dst_memlet.data = dst_node.data
+                    state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn, dst_memlet)
+                    state.remove_edge(edge)
+                else:
+                    src_subset = memlet.subset if edge.data.data == src_node.data else memlet.other_subset
+                    dst_subset = memlet.other_subset if edge.data.data == src_node.data else memlet.subset
+                    state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data,
+                                   dace.Memlet(data=src_node.data, subset=src_subset))
+                    state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn,
+                                   dace.Memlet(data=dst_node.data, subset=dst_subset))
+                    state.remove_edge(edge)
+
+        return {}
+
+    def find_all_data_copies(
+        self, sdfg: SDFG
+    ) -> List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, MultiConnectorEdge[dace.Memlet]]]:
+        """
+        Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node,
+        destination node, and the first edge of the memlet path between source and destination node.
+
+        Parameters
+        ----------
+        sdfg : SDFG
+            The SDFG to analyze for potential data copies.
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, MultiConnectorEdge[dace.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, + MultiConnectorEdge[dace.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[dace.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist diff --git a/tests/gpu_specialization/explicit_global_memory_copy_test.py b/tests/gpu_specialization/explicit_global_memory_copy_test.py new file mode 100644 index 0000000000..92cefed48a --- /dev/null +++ b/tests/gpu_specialization/explicit_global_memory_copy_test.py @@ -0,0 +1,331 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import pytest +import numpy as np +from typing import Tuple +from dace.transformation.passes.gpu_specialization.insert_explicit_gpu_global_memory_copies import InsertExplicitGPUGlobalMemoryCopies + + +def _get_sdfg(name_str: str, dimension: Tuple[int], copy_strides: Tuple[int]) -> dace.SDFG: + sdfg = dace.SDFG(name_str) + state = sdfg.add_state("state0", is_start_block=True) + for arr_name in ["A", "B"]: + sdfg.add_array(arr_name, dimension, dace.float32, dace.dtypes.StorageType.GPU_Global) + a = state.add_access("A") + b = state.add_access("B") + copy_str = ", ".join([f"0:{dimension[i]}:{copy_strides[i]}" for i in range(len(dimension))]) + state.add_edge(a, None, b, None, dace.Memlet(f"A[{copy_str}]")) + sdfg.validate() + return sdfg + + +def _get_sdfg_with_other_subset(name_str: str, dimension: Tuple[int], copy_strides: Tuple[int]) -> dace.SDFG: + sdfg = dace.SDFG(name_str) + state = sdfg.add_state("state0", is_start_block=True) + for arr_name in ["A", "B"]: + sdfg.add_array(arr_name, dimension, dace.float32, dace.dtypes.StorageType.GPU_Global) + a = state.add_access("A") + b = state.add_access("B") + # copy_str = ", ".join([f"0:{dimension[i]}:{copy_strides[i]}" for i in range(len(dimension))]) + src_subset = dace.subsets.Range([((dimension[i] // 2), dimension[i] - 1, copy_strides[i]) + for i in range(len(dimension))]) + dst_subset = dace.subsets.Range([(0, (dimension[i] // 2) - 1, copy_strides[i]) for i in range(len(dimension))]) + state.add_edge(a, None, b, None, dace.Memlet(data="B", subset=dst_subset, other_subset=src_subset)) + sdfg.validate() + return sdfg + + +def _count_tasklets(sdfg: dace.SDFG) -> int: + """Count the number of tasklets in the SDFG.""" + count = 0 + for state in sdfg.nodes(): + for node in state.nodes(): + if isinstance(node, dace.nodes.Tasklet): + count += 1 + return count + + +def _count_nsdfgs(sdfg: dace.SDFG) -> int: + """Count the number of 
nested SDFGs in the SDFG.""" + count = 0 + for state in sdfg.nodes(): + for node in state.nodes(): + if isinstance(node, dace.nodes.NestedSDFG): + count += 1 + return count + + +@pytest.mark.gpu +def test_1d_copy(): + """Test 1D unit stride copy.""" + import cupy as cp + + dimension = (8, ) + copy_strides = (1, ) + + sdfg = _get_sdfg("test_1d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0]] + cp.testing.assert_array_equal(B, expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_1d_copy_w_other_subset(): + """Test 1D unit stride copy.""" + import cupy as cp + + dimension = (8, ) + copy_strides = (1, ) + + sdfg = _get_sdfg_with_other_subset("test_1d_copy_w_other_subset", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("x.sdfg") + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[4:8:copy_strides[0]] + cp.testing.assert_array_equal(B[0:4], expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_2d_copy(): + """Test 2D unit stride copy with other subset not None.""" + import cupy as cp + + dimension = (8, 8) + copy_strides = (1, 1) + + sdfg = _get_sdfg("test_2d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0], ::copy_strides[1]] + cp.testing.assert_array_equal(B, expected) + + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_2d_copy_with_other_subset(): + """Test 2D unit stride copy with other subset not None.""" + import cupy as cp + + dimension = (8, 8) + copy_strides = (1, 1) + + sdfg = _get_sdfg_with_other_subset("test_2d_copy_with_other_subset", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[4:8:copy_strides[0], 4:8:copy_strides[1]] + cp.testing.assert_array_equal(B[0:4, 0:4], expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_3d_copy(): + """Test 3D unit stride copy.""" + import cupy as cp + + dimension = (8, 4, 4) + copy_strides = (1, 1, 1) + + sdfg = _get_sdfg("test_3d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0], ::copy_strides[1], ::copy_strides[2]] + cp.testing.assert_array_equal(B, expected) + + assert num_tasklets == 1 + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride", [2, 4]) +def test_1d_strided_copy(stride): + """Test 1D strided copy with varying strides.""" + import cupy as cp + + dimension = (8, ) + copy_strides 
= (stride, ) + + sdfg = _get_sdfg(f"test_1d_strided_copy_s{stride}", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness - only elements at stride intervals should be copied + expected = cp.zeros_like(A) + expected[::stride] = A[::stride] + cp.testing.assert_array_equal(B[::stride], expected[::stride]) + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2", [(2, 1), (4, 1), (1, 2), (1, 4)]) +def test_2d_strided_copy(stride_1, stride_2): + """Test 2D strided copy. First dimension is unit stride, second is strided.""" + import cupy as cp + + dimension = (8, 4) + copy_strides = (stride_1, stride_2) + + sdfg = _get_sdfg(f"test_2d_strided_copy_s{stride_1}_{stride_2}", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = cp.zeros_like(A) + expected[::stride_1, ::stride_2] = A[::stride_1, ::stride_2] + cp.testing.assert_array_equal(B[::stride_1, ::stride_2], expected[::stride_1, ::stride_2]) + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2,stride_3", [(1, 2, 2), (1, 2, 4), (1, 4, 2), (4, 1, 1), (4, 2, 1), + (2, 2, 1)]) +def test_3d_strided_copy(stride_1, stride_2, stride_3): + """Test 3D strided copy. First dimension is unit stride, others are strided.""" + import cupy as cp + + dimension = (8, 4, 4) + copy_strides = (stride_1, stride_2, stride_3) + + sdfg = _get_sdfg(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}", dimension, copy_strides) + sdfg.save("x1.sdfg") + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("x2.sdfg") + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = cp.zeros_like(A) + expected[::stride_1, ::stride_2, ::stride_3] = A[::stride_1, ::stride_2, ::stride_3] + cp.testing.assert_array_equal(B, expected) + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2,stride_3", [ + (1, 2, 2), + (1, 2, 4), + (1, 4, 2), + (2, 2, 1), +]) +def test_3d_strided_copy_w_other_subset(stride_1, stride_2, stride_3): + """Test 3D strided copy. 
First dimension is unit stride, others are strided.""" + import cupy as cp + + dimension = (8, 8, 8) + copy_strides = (stride_1, stride_2, stride_3) + + sdfg = _get_sdfg_with_other_subset(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}_w_other_subset", + dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + cp.testing.assert_array_equal(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]], + A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]]) + + +@pytest.mark.gpu +def test_independent_copies(): + + @dace.program + def independent_copies(A: dace.uint32[128], B: dace.uint32[128], C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = independent_copies.to_sdfg() + sdfg.apply_gpu_transformations() + sdfg.validate() + sdfg.save("s1.sdfg") + + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("s2.sdfg") + + sdfg.validate() + sdfg.compile()
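
For readers trying the pass outside the test suite, the following minimal sketch (mirroring the tests above; the program, array names, and sizes are illustrative) shows how the pass is applied after the standard GPU transformations:

```python
import dace
from dace.transformation.passes.gpu_specialization.insert_explicit_gpu_global_memory_copies import (
    InsertExplicitGPUGlobalMemoryCopies, )


@dace.program
def scale(A: dace.float32[256], B: dace.float32[256]):
    for i in dace.map[0:256]:
        B[i] = 2 * A[i]


sdfg = scale.to_sdfg()
sdfg.apply_gpu_transformations()  # introduce GPU_Global arrays and GPU_Device maps
InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {})  # make host<->device copies explicit
sdfg.validate()
```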