From f519b2674c6335aff5cbf642e4aad78600dcd4ad Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Tue, 6 Jan 2026 15:15:42 +0100 Subject: [PATCH 01/14] Extend cases supported by the explicit copy transformations --- .../gpu_specialization_utilities/__init__.py | 0 .../copy_strategies.py | 549 ++++++++++++++++++ .../gpu_specialization_utilities/gpu_utils.py | 27 + .../passes/gpu_specialization/__init__.py | 0 ...nsert_explicit_gpu_global_memory_copies.py | 256 ++++++++ .../explicit_global_memory_copy_test.py | 328 +++++++++++ 6 files changed, 1160 insertions(+) create mode 100644 dace/codegen/gpu_specialization_utilities/__init__.py create mode 100644 dace/codegen/gpu_specialization_utilities/copy_strategies.py create mode 100644 dace/codegen/gpu_specialization_utilities/gpu_utils.py create mode 100644 dace/transformation/passes/gpu_specialization/__init__.py create mode 100644 dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py create mode 100644 tests/gpu_specialization/explicit_global_memory_copy_test.py diff --git a/dace/codegen/gpu_specialization_utilities/__init__.py b/dace/codegen/gpu_specialization_utilities/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/codegen/gpu_specialization_utilities/copy_strategies.py b/dace/codegen/gpu_specialization_utilities/copy_strategies.py new file mode 100644 index 0000000000..685d0c11db --- /dev/null +++ b/dace/codegen/gpu_specialization_utilities/copy_strategies.py @@ -0,0 +1,549 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Union + +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace import symbolic +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp +from dace.codegen.gpu_specialization_utilities.gpu_utils import generate_sync_debug_call +from dace.config import Config +from dace.dtypes import StorageType +from dace.frontend import operations +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. 
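+
+    A minimal usage sketch (illustrative only; the integer stream IDs are
+    assumptions for the example):
+
+        ctx = CopyContext(sdfg, state, src_node, dst_node, edge,
+                          gpustream_assignments={src_node: 0, dst_node: 0})
+        strategy = OutOfKernelCopyStrategy()
+        if strategy.applicable(ctx):
+            code = strategy.generate_copy(ctx)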
+ """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.gpustream_assignments = gpustream_assignments + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. + + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. + """ + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) + + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") + + if src_stream != dst_stream: + raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same.") + + # 2. Generate GPU stream expression + gpustream = src_stream + # gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream}" # {gpustream_var_name_prefix} + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. 
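+
+        Example
+        -------
+        For a CPU_Heap -> GPU_Global copy this returns ('Host', 'Device'),
+        from which the copy strategies build the transfer kind, e.g.
+        'cudaMemcpyHostToDevice' for the 'cuda' backend.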
+ """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. + """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be 
AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = src_subset + # dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + return copy_shape, src_strides, dst_strides, src_node.data, dst_node.data + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. 
Ensure copy is not occurring within a kernel
+        scope_dict = state.scope_dict()
+        deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node
+
+        parent_map_tuple = helpers.get_parent_map(state, deeper_node)
+        while parent_map_tuple is not None:
+            parent_map, parent_state = parent_map_tuple
+            if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN:
+                return False
+            else:
+                parent_map_tuple = helpers.get_parent_map(parent_state, parent_map)
+
+        # 2. Check whether copy is between two AccessNodes
+        if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)):
+            return False
+
+        # 3. The data descriptors of source and destination are not views
+        if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View):
+            return False
+
+        # 4. Check that the storage type of either src or dst is GPU_Global or CPU_Pinned
+        src_storage = copy_context.get_storage_type(src_node)
+        dst_storage = copy_context.get_storage_type(dst_node)
+        if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)
+                or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)):
+            return False
+
+        # 5. Check that this is not a CPU to CPU copy
+        cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned]
+        if src_storage in cpu_storage_types and dst_storage in cpu_storage_types:
+            return False
+
+        return True
+
+    def generate_copy(self, copy_context: CopyContext) -> str:
+        """Generate the host<->device copy call using the GPU backend's memory operations."""
+
+        # Guard
+        memlet = copy_context.edge.data
+        if memlet.wcr is not None:
+            src_location, dst_location = copy_context.get_memory_location()
+            raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented')
+
+        # Based on the copy dimensionality, call the appropriate helper function
+        num_dims = len(copy_context.copy_shape)
+        if num_dims == 1:
+            copy_call = self._generate_1d_copy(copy_context)
+
+        elif num_dims == 2:
+            copy_call = self._generate_2d_copy(copy_context)
+
+        else:
+            # sanity check
+            assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}."
+            copy_call = self._generate_nd_copy(copy_context)
+
+        return copy_call
+
+    def _generate_1d_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates a 1D memory copy between host and device using the GPU backend.
+
+        Uses {backend}MemcpyAsync for contiguous memory. For strided memory,
+        {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension.
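+
+        Example (illustrative; assumes the 'cuda' backend, a contiguous copy of
+        N floats from host array A to device array B, and an assigned stream
+        expression gpu_stream0):
+
+            DACE_GPU_CHECK(cudaMemcpyAsync(_out_B, _in_A, N * sizeof(float),
+                                           cudaMemcpyHostToDevice, gpu_stream0));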
+        """
+        # ----------- Retrieve relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1)
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+
+        # ----------------- Generate backend call --------------------
+
+        if is_contiguous_copy:
+            # Memory is linear: can use {backend}MemcpyAsync
+            copysize = ' * '.join(sym2cpp(copy_shape))
+            copysize += f' * sizeof({ctype})'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+            call = f'DACE_GPU_CHECK({backend}MemcpyAsync(_out_{dst_expr}, _in_{src_expr}, {copysize}, {kind}, {gpustream}));\n'
+
+        else:
+            # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch,
+            # which allows copying a strided 1D region
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_2d_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates a 2D memory copy using {backend}Memcpy2DAsync.
+
+        Three main cases are handled:
+        - Copy between row-major stored arrays with contiguous rows.
+        - Copy between column-major stored arrays with contiguous columns.
+        - A special case where a 2D copy can still be represented.
+
+        Raises:
+            NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns.
+            Such cases indicate an unsupported 2D copy and should be examined separately.
+            They can be implemented if valid, or a more descriptive error should be raised if the path should not occur.
+
+        Note:
+            {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column),
+            but not both simultaneously.
+        """
+
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+
+        # ----------------- Generate backend call if supported --------------------
+        # Case: Row-major layout, rows are not strided.
+        if (src_strides[1] == 1) and (dst_strides[1] == 1):
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[0])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Case: Column-major layout, no columns are strided.
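+        # Example (for illustration): copying an 8x4 block between column-major
+        # arrays with strides [1, 8] swaps the roles of the two dimensions:
+        # the pitch comes from strides[1] (8 elements), the contiguous
+        # copy_shape[0] = 8 elements form the copied row width, and
+        # copy_shape[1] = 4 becomes the number of rows.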
+        elif (src_strides[0] == 1) and (dst_strides[0] == 1):
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[1])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Special case
+        elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]):
+            # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with
+            # copy shape [I, J], src_strides [J*K, K], dst_strides [J, 1]. This can be represented with a
+            # {backend}Memcpy2DAsync call!
+
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0] * copy_shape[1])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        else:
+            raise NotImplementedError(
+                f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}. "
+                "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken."
+            )
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_nd_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates GPU code for copying N-dimensional arrays using 2D memory copies.
+
+        Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops
+        for any outer dimensions. Expects the copy to be contiguous and between
+        row-major storage locations.
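+
+        Example (illustrative; assumes the 'cuda' backend and a row-major
+        [I, J, K] copy of floats, with 'stream' standing in for the assigned
+        GPU stream): one loop is emitted for the outer dimension, wrapping a
+        2D copy of a J x K tile per iteration:
+
+            for (int __copyidx0 = 0; __copyidx0 < I; ++__copyidx0) {
+                DACE_GPU_CHECK(cudaMemcpy2DAsync(
+                    _out_B + (__copyidx0 * (J*K)), K * sizeof(float),
+                    _in_A + (__copyidx0 * (J*K)), K * sizeof(float),
+                    K * sizeof(float), J, cudaMemcpyHostToDevice, stream));
+            }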
+        """
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+        num_dims = len(copy_shape)
+
+        # ----------- Guard for unsupported Pattern --------------
+        if not (src_strides[-1] == 1 and dst_strides[-1] == 1):
+            src_node, dst_node = copy_context.src_node, copy_context.dst_node
+            src_storage = copy_context.get_storage_type(src_node)
+            dst_storage = copy_context.get_storage_type(dst_node)
+            raise NotImplementedError(
+                "N-dimensional GPU memory copies that are strided or contain column-major arrays are currently not supported.\n"
+                f" Source node: {src_node} (storage: {src_storage})\n"
+                f" Destination node: {copy_context.dst_node} (storage: {dst_storage})\n"
+                f" Source strides: {src_strides}\n"
+                f" Destination strides: {dst_strides}\n"
+                f" Copy shape: {copy_shape}\n")
+
+        # ----------------- Generate and write backend call(s) --------------------
+
+        call = ""
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n"
+
+        # Write the {backend}Memcpy2DAsync call
+        offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})'
+        spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})'
+        width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})'
+        height = sym2cpp(copy_shape[-2])
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate call and write it
+        call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst}, {dpitch}, _in_{src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization may be required if syncdebug is enabled in the configuration
+        call += generate_sync_debug_call()
+
+        # Write for-loop footers
+        for dim in range(num_dims - 2):
+            call += "\n}"
+
+        # Return the code
+        return call
\ No newline at end of file
diff --git a/dace/codegen/gpu_specialization_utilities/gpu_utils.py b/dace/codegen/gpu_specialization_utilities/gpu_utils.py
new file mode 100644
index 0000000000..4c742aaee5
--- /dev/null
+++ b/dace/codegen/gpu_specialization_utilities/gpu_utils.py
@@ -0,0 +1,27 @@
+# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
+from dace import Config
+from dace.codegen import common
+
+
+def generate_sync_debug_call() -> str:
+    """
+    Generate backend sync and error-check calls as a string if
+    synchronous debugging is enabled. The backend API prefix (e.g., 'cuda')
+    is resolved internally; this function takes no parameters.
+
+    Returns
+    -------
+    str
+        The generated debug call code, or an empty string if debugging is disabled.
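+
+    Example
+    -------
+    With syncdebug enabled and the 'cuda' backend, the returned string is:
+
+        DACE_GPU_CHECK(cudaGetLastError());
+        DACE_GPU_CHECK(cudaDeviceSynchronize());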
+ """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call \ No newline at end of file diff --git a/dace/transformation/passes/gpu_specialization/__init__.py b/dace/transformation/passes/gpu_specialization/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py new file mode 100644 index 0000000000..48ee409f39 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py @@ -0,0 +1,256 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace import memlet as mm +from dace.codegen.gpu_specialization_utilities.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.config import Config +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation + + +def create_sdfg_with_copy_only(parent_state, parent_src_node, parent_dst_node, edge) -> dace.SDFG: + sdfg = dace.SDFG("copy_subsdfg") + state = sdfg.add_state("copy_state", is_start_block=True) + + # Currently only 1D and 2D copies are supported + map_ranges = dict() + for i, dim in enumerate(edge.data.subset): + map_ranges[f"i{i}"] = f"0:{dim[1]+1-dim[0]}:{dim[2]}" + + access_expr = ",".join(f"i{i}" for i in range(len(edge.data.subset))) + + src_desc = parent_state.sdfg.arrays[parent_src_node.data] + dst_desc = parent_state.sdfg.arrays[parent_dst_node.data] + + # In nested SDFG we add "view_" prefix + src_node = state.add_access("view_" + parent_src_node.data) + dst_node = state.add_access("view_" + parent_dst_node.data) + + # Add new arrays for the copy SDFG + # Determine src and dst subsets + src_subset = edge.data.subset if edge.data.data == parent_src_node.data else edge.data.other_subset + dst_subset = edge.data.other_subset if edge.data.data == parent_src_node.data else edge.data.subset + + # Collect the new shapes + src_shape = [e + 1 - b for b, e, s in src_subset] + dst_shape = [e + 1 - b for b, e, s in dst_subset] + + # Preserve strides as-is + src_strides = src_desc.strides + dst_strides = dst_desc.strides + + # Create string subset expressions to return + src_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in src_subset]) + dst_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in dst_subset]) + + # Add arrays as views + sdfg.add_array("view_" + parent_src_node.data, src_shape, src_desc.dtype, src_desc.storage, None, False, src_strides) + sdfg.add_array("view_" + parent_dst_node.data, dst_shape, dst_desc.dtype, dst_desc.storage, None, False, dst_strides) + + # Add copy kernel + tasklet, map_entry, map_exit = state.add_mapped_tasklet( + name="gpu_copy_kernel_fallback", + map_ranges=map_ranges, + inputs={"_in": dace.memlet.Memlet(f"{src_node.data}[{access_expr}]")}, + outputs={"_out": dace.memlet.Memlet(f"{dst_node.data}[{access_expr}]")}, + code="_out = _in", + schedule=dtypes.ScheduleType.GPU_Device, + unroll_map=False, + language=dtypes.Language.Python, + external_edges=True, + 
propagate=True, + input_nodes={src_node.data: src_node}, + output_nodes={dst_node.data: dst_node}, + ) + + return sdfg, src_subset_expr, dst_subset_expr + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertExplicitGPUGlobalMemoryCopies(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = set() + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. + """ + # Prepare GPU stream + + # gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + gpustream_assignments: Dict[nodes.Node, Union[int, str]] = dict() + + # Initialize the strategy for copies that occur outside of kernel execution + out_of_kernel_copy = OutOfKernelCopyStrategy() + + # Get all data copies to process the out of kernel copies + copy_worklist = self.find_all_data_copies(sdfg) + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + gpustream_assignments[src_node] = "__dace_current_stream" + gpustream_assignments[dst_node] = "__dace_current_stream" + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments) + + # Only insert copy tasklets for GPU related copies occuring out of the + # kernel (i.e. 
a GPU_device scheduled map) + if not out_of_kernel_copy.applicable(copy_context): + continue + + # If the subset has more than 2 dimensions and is not contiguous (represented as a 1D memcpy) then fallback to a copy kernel + if len(edge.data.subset) > 2 and not edge.data.subset.is_contiguous_subset(state.sdfg.arrays[edge.data.data]): + + # If other subset is not None, we do not need a nested SDFG + if edge.data.other_subset is None: + # Currently only 1D and 2D copies are supported + map_ranges = dict() + for i, dim in enumerate(edge.data.subset): + map_ranges[f"i{i}"] = f"{dim[0]}:{dim[1]+1}:{dim[2]}" + access_expr = ",".join(f"i{i}" for i in range(len(edge.data.subset))) + + tasklet, map_entry, map_exit = state.add_mapped_tasklet( + name="gpu_copy_kernel_fallback", + map_ranges=map_ranges, + inputs={"_in": dace.memlet.Memlet(f"{src_node.data}[{access_expr}]")}, + outputs={"_out": dace.memlet.Memlet(f"{dst_node.data}[{access_expr}]")}, + code="_out = _in", + schedule=dtypes.ScheduleType.GPU_Device, + unroll_map=False, + language=dtypes.Language.Python, + external_edges=True, + propagate=True, + input_nodes={src_node.data: src_node}, + output_nodes={dst_node.data: dst_node}, + ) + # Add connectors to the out edge of map_entry and in edge of map_exit + state.remove_edge(edge) + else: + copy_sdfg, src_subset_expr, dst_subset_expr = create_sdfg_with_copy_only(state, src_node, dst_node, edge) + nsdfg = state.add_nested_sdfg( + sdfg=copy_sdfg, + inputs={"view_" + src_node.data}, + outputs={"view_" + dst_node.data}, + ) + state.remove_edge(edge) + state.add_edge(src_node, None, nsdfg, "view_" + src_node.data, dace.Memlet(f"{src_node.data}[{src_subset_expr}]")) + state.add_edge(nsdfg, "view_" + dst_node.data, dst_node, None, dace.Memlet(f"{dst_node.data}[{dst_subset_expr}]")) + else: + # Generatae the copy call + code = out_of_kernel_copy.generate_copy(copy_context) + + # Prepare GPU ustream connectors and the stream to be accessed from the + # GPU stream array + # Create the tasklet and add GPU stream related connectors + tasklet = state.add_tasklet("gpu_copy", { "_in_" + src_node.data }, { "_out_" + dst_node.data }, code, language=dtypes.Language.CPP) + + # Put the tasklet in between the edge + dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge + + if memlet.other_subset is None: + state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, copy.deepcopy(memlet)) + dst_memlet = copy.deepcopy(memlet) + dst_memlet.data = dst_node.data + state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn, dst_memlet) + state.remove_edge(edge) + else: + src_subset = memlet.subset if edge.data.data == src_node.data else memlet.other_subset + dst_subset = memlet.other_subset if edge.data.data == src_node.data else memlet.subset + state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, dace.Memlet(data=src_node.data, subset=src_subset)) + state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn, dace.Memlet(data=dst_node.data, subset=dst_subset)) + state.remove_edge(edge) + + return {} + + def find_all_data_copies( + self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: + """ + Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, + destination node, and the first memlet edge of in the memlet path between source and destination node. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze for potential data copies. 
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist \ No newline at end of file diff --git a/tests/gpu_specialization/explicit_global_memory_copy_test.py b/tests/gpu_specialization/explicit_global_memory_copy_test.py new file mode 100644 index 0000000000..989c1575d0 --- /dev/null +++ b/tests/gpu_specialization/explicit_global_memory_copy_test.py @@ -0,0 +1,328 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import pytest +import numpy as np +from typing import Tuple +from dace.transformation.passes.gpu_specialization.insert_explicit_gpu_global_memory_copies import InsertExplicitGPUGlobalMemoryCopies + +def _get_sdfg(name_str: str, dimension: Tuple[int], copy_strides: Tuple[int]) -> dace.SDFG: + sdfg = dace.SDFG(name_str) + state = sdfg.add_state("state0", is_start_block=True) + for arr_name in ["A", "B"]: + sdfg.add_array(arr_name, dimension, dace.float32, dace.dtypes.StorageType.GPU_Global) + a = state.add_access("A") + b = state.add_access("B") + copy_str = ", ".join([f"0:{dimension[i]}:{copy_strides[i]}" for i in range(len(dimension))]) + state.add_edge(a, None, b, None, dace.Memlet(f"A[{copy_str}]")) + sdfg.validate() + return sdfg + + +def _get_sdfg_with_other_subset(name_str: str, dimension: Tuple[int], copy_strides: Tuple[int]) -> dace.SDFG: + sdfg = dace.SDFG(name_str) + state = sdfg.add_state("state0", is_start_block=True) + for arr_name in ["A", "B"]: + sdfg.add_array(arr_name, dimension, dace.float32, dace.dtypes.StorageType.GPU_Global) + a = state.add_access("A") + b = state.add_access("B") + # copy_str = ", ".join([f"0:{dimension[i]}:{copy_strides[i]}" for i in range(len(dimension))]) + src_subset = dace.subsets.Range([((dimension[i] // 2), dimension[i] - 1, copy_strides[i]) for i in range(len(dimension))]) + dst_subset = dace.subsets.Range([(0, (dimension[i] // 2) - 1, copy_strides[i]) for i in range(len(dimension))]) + state.add_edge(a, None, b, None, dace.Memlet(data="B", subset=dst_subset, other_subset=src_subset)) + sdfg.validate() + return sdfg + +def _count_tasklets(sdfg: dace.SDFG) -> int: + """Count the number of tasklets in the SDFG.""" + count = 0 + for state in sdfg.nodes(): + for node in state.nodes(): + if isinstance(node, dace.nodes.Tasklet): + count += 1 + return count + +def _count_nsdfgs(sdfg: dace.SDFG) -> int: + """Count the number of nested 
SDFGs in the SDFG.""" + count = 0 + for state in sdfg.nodes(): + for node in state.nodes(): + if isinstance(node, dace.nodes.NestedSDFG): + count += 1 + return count + +@pytest.mark.gpu +def test_1d_copy(): + """Test 1D unit stride copy.""" + import cupy as cp + + dimension = (8,) + copy_strides = (1,) + + sdfg = _get_sdfg("test_1d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0]] + cp.testing.assert_array_equal(B, expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_1d_copy_w_other_subset(): + """Test 1D unit stride copy.""" + import cupy as cp + + dimension = (8,) + copy_strides = (1,) + + sdfg = _get_sdfg_with_other_subset("test_1d_copy_w_other_subset", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("x.sdfg") + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[4:8:copy_strides[0]] + cp.testing.assert_array_equal(B[0:4], expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_2d_copy(): + """Test 2D unit stride copy with other subset not None.""" + import cupy as cp + + dimension = (8, 8) + copy_strides = (1, 1) + + sdfg = _get_sdfg("test_2d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0], ::copy_strides[1]] + cp.testing.assert_array_equal(B, expected) + + assert num_tasklets == 1 + + print(f"2D copy: {num_tasklets} tasklets") + +@pytest.mark.gpu +def test_2d_copy_with_other_subset(): + """Test 2D unit stride copy with other subset not None.""" + import cupy as cp + + dimension = (8, 8) + copy_strides = (1, 1) + + sdfg = _get_sdfg_with_other_subset("test_2d_copy_with_other_subset", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[4:8:copy_strides[0], 4:8:copy_strides[1]] + cp.testing.assert_array_equal(B[0:4, 0:4], expected) + assert num_tasklets == 1 + + print(f"2D copy: {num_tasklets} tasklets") + +@pytest.mark.gpu +def test_3d_copy(): + """Test 3D unit stride copy.""" + import cupy as cp + + dimension = (8, 4, 4) + copy_strides = (1, 1, 1) + + sdfg = _get_sdfg("test_3d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0], ::copy_strides[1], ::copy_strides[2]] + cp.testing.assert_array_equal(B, expected) + + assert num_tasklets == 1 + + print(f"3D copy: {num_tasklets} tasklets") + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride", [2, 4]) +def 
test_1d_strided_copy(stride): + """Test 1D strided copy with varying strides.""" + import cupy as cp + + dimension = (8,) + copy_strides = (stride,) + + sdfg = _get_sdfg(f"test_1d_strided_copy_s{stride}", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness - only elements at stride intervals should be copied + expected = cp.zeros_like(A) + expected[::stride] = A[::stride] + cp.testing.assert_array_equal(B[::stride], expected[::stride]) + + print(f"1D strided copy (stride={stride}): {num_tasklets} tasklets") + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2", [(2, 1), (4, 1), (1, 2), (1, 4)]) +def test_2d_strided_copy(stride_1, stride_2): + """Test 2D strided copy. First dimension is unit stride, second is strided.""" + import cupy as cp + + dimension = (8, 4) + copy_strides = (stride_1, stride_2) + + sdfg = _get_sdfg(f"test_2d_strided_copy_s{stride_1}_{stride_2}", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = cp.zeros_like(A) + expected[::stride_1, ::stride_2] = A[::stride_1, ::stride_2] + cp.testing.assert_array_equal(B[::stride_1, ::stride_2], expected[::stride_1, ::stride_2]) + + print(f"2D strided copy (strides={stride_1},{stride_2}): {num_tasklets} tasklets") + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2,stride_3", [ + (1, 2, 2), + (1, 2, 4), + (1, 4, 2), + (4, 1, 1), + (4, 2, 1), + (2, 2, 1) +]) +def test_3d_strided_copy(stride_1, stride_2, stride_3): + """Test 3D strided copy. First dimension is unit stride, others are strided.""" + import cupy as cp + + dimension = (8, 4, 4) + copy_strides = (stride_1, stride_2, stride_3) + + sdfg = _get_sdfg(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}", + dimension, copy_strides) + sdfg.save("x1.sdfg") + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("x2.sdfg") + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = cp.zeros_like(A) + expected[::stride_1, ::stride_2, ::stride_3] = A[::stride_1, ::stride_2, ::stride_3] + cp.testing.assert_array_equal(B, expected) + + print(f"3D strided copy (strides={stride_1},{stride_2},{stride_3}): {num_tasklets} tasklets") + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2,stride_3", [ + (1, 2, 2), + (1, 2, 4), + (1, 4, 2), + (2, 2, 1), +]) +def test_3d_strided_copy_w_other_subset(stride_1, stride_2, stride_3): + """Test 3D strided copy. 
First dimension is unit stride, others are strided.""" + import cupy as cp + + dimension = (8, 8, 8) + copy_strides = (stride_1, stride_2, stride_3) + + sdfg = _get_sdfg_with_other_subset(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}_w_other_subset", + dimension, copy_strides) + sdfg.save("pre.sdfg") + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 0 + num_nsdfgs = _count_nsdfgs(sdfg) + assert num_nsdfgs == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + print(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]]) + print(A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]]) + cp.testing.assert_array_equal( + B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]], + A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]] + ) + print(f"3D strided copy (strides={stride_1},{stride_2},{stride_3}): {num_tasklets} tasklets") \ No newline at end of file From e483a25e4f76811773cf58176af94f2033113cde Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Tue, 6 Jan 2026 15:26:55 +0100 Subject: [PATCH 02/14] Refactor, use views+ --- .../copy_strategies.py | 4 +- .../gpu_specialization_utilities/gpu_utils.py | 2 +- ...nsert_explicit_gpu_global_memory_copies.py | 86 ++++++++++--------- .../explicit_global_memory_copy_test.py | 70 +++++++-------- 4 files changed, 80 insertions(+), 82 deletions(-) diff --git a/dace/codegen/gpu_specialization_utilities/copy_strategies.py b/dace/codegen/gpu_specialization_utilities/copy_strategies.py index 685d0c11db..518e27c21f 100644 --- a/dace/codegen/gpu_specialization_utilities/copy_strategies.py +++ b/dace/codegen/gpu_specialization_utilities/copy_strategies.py @@ -96,7 +96,7 @@ def get_assigned_gpustream(self) -> str: # 2. 
Generate GPU stream expression gpustream = src_stream # gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] - gpustream_expr = f"{gpustream}" # {gpustream_var_name_prefix} + gpustream_expr = f"{gpustream}" # {gpustream_var_name_prefix} return gpustream_expr @@ -546,4 +546,4 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: call += "\n}" # Return the code - return call \ No newline at end of file + return call diff --git a/dace/codegen/gpu_specialization_utilities/gpu_utils.py b/dace/codegen/gpu_specialization_utilities/gpu_utils.py index 4c742aaee5..b02340b338 100644 --- a/dace/codegen/gpu_specialization_utilities/gpu_utils.py +++ b/dace/codegen/gpu_specialization_utilities/gpu_utils.py @@ -24,4 +24,4 @@ def generate_sync_debug_call() -> str: sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") - return sync_call \ No newline at end of file + return sync_call diff --git a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py index 48ee409f39..7c674e123c 100644 --- a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py +++ b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py @@ -4,18 +4,15 @@ import dace from dace import SDFG, SDFGState, dtypes, properties -from dace import memlet as mm from dace.codegen.gpu_specialization_utilities.copy_strategies import CopyContext, OutOfKernelCopyStrategy from dace.config import Config from dace.sdfg import nodes, scope_contains_scope -from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.graph import Edge, MultiConnectorEdge from dace.transformation import pass_pipeline as ppl, transformation -def create_sdfg_with_copy_only(parent_state, parent_src_node, parent_dst_node, edge) -> dace.SDFG: - sdfg = dace.SDFG("copy_subsdfg") - state = sdfg.add_state("copy_state", is_start_block=True) - +def create_viewed_copy_kernel(parent_state: dace.SDFGState, src_node: dace.nodes.AccessNode, + dst_node: dace.nodes.AccessNode, edge: Edge[dace.Memlet]) -> dace.SDFG: # Currently only 1D and 2D copies are supported map_ranges = dict() for i, dim in enumerate(edge.data.subset): @@ -23,17 +20,13 @@ def create_sdfg_with_copy_only(parent_state, parent_src_node, parent_dst_node, e access_expr = ",".join(f"i{i}" for i in range(len(edge.data.subset))) - src_desc = parent_state.sdfg.arrays[parent_src_node.data] - dst_desc = parent_state.sdfg.arrays[parent_dst_node.data] - - # In nested SDFG we add "view_" prefix - src_node = state.add_access("view_" + parent_src_node.data) - dst_node = state.add_access("view_" + parent_dst_node.data) + src_desc = parent_state.sdfg.arrays[src_node.data] + dst_desc = parent_state.sdfg.arrays[dst_node.data] # Add new arrays for the copy SDFG # Determine src and dst subsets - src_subset = edge.data.subset if edge.data.data == parent_src_node.data else edge.data.other_subset - dst_subset = edge.data.other_subset if edge.data.data == parent_src_node.data else edge.data.subset + src_subset = edge.data.subset if edge.data.data == src_node.data else edge.data.other_subset + dst_subset = edge.data.other_subset if edge.data.data == src_node.data else edge.data.subset # Collect the new shapes src_shape = [e + 1 - b for b, e, s in src_subset] @@ -43,31 +36,37 @@ def create_sdfg_with_copy_only(parent_state, parent_src_node, 
parent_dst_node, e src_strides = src_desc.strides dst_strides = dst_desc.strides + _, src_view = parent_state.sdfg.add_view("view_" + src_node.data, src_shape, src_desc.dtype, src_desc.storage, + src_strides) + _, dst_view = parent_state.sdfg.add_view("view_" + dst_node.data, dst_shape, dst_desc.dtype, dst_desc.storage, + dst_strides) + + # In nested SDFG we add "view_" prefix + view_src_node = parent_state.add_access("view_" + src_node.data) + view_dst_node = parent_state.add_access("view_" + dst_node.data) + # Create string subset expressions to return src_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in src_subset]) dst_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in dst_subset]) - # Add arrays as views - sdfg.add_array("view_" + parent_src_node.data, src_shape, src_desc.dtype, src_desc.storage, None, False, src_strides) - sdfg.add_array("view_" + parent_dst_node.data, dst_shape, dst_desc.dtype, dst_desc.storage, None, False, dst_strides) - # Add copy kernel - tasklet, map_entry, map_exit = state.add_mapped_tasklet( + tasklet, map_entry, map_exit = parent_state.add_mapped_tasklet( name="gpu_copy_kernel_fallback", map_ranges=map_ranges, - inputs={"_in": dace.memlet.Memlet(f"{src_node.data}[{access_expr}]")}, - outputs={"_out": dace.memlet.Memlet(f"{dst_node.data}[{access_expr}]")}, + inputs={"_in": dace.memlet.Memlet(f"{view_src_node.data}[{access_expr}]")}, + outputs={"_out": dace.memlet.Memlet(f"{view_dst_node.data}[{access_expr}]")}, code="_out = _in", schedule=dtypes.ScheduleType.GPU_Device, unroll_map=False, language=dtypes.Language.Python, external_edges=True, propagate=True, - input_nodes={src_node.data: src_node}, - output_nodes={dst_node.data: dst_node}, + input_nodes={view_src_node.data: view_src_node}, + output_nodes={view_dst_node.data: view_dst_node}, ) - return sdfg, src_subset_expr, dst_subset_expr + return view_src_node, src_subset_expr, view_dst_node, dst_subset_expr + @properties.make_properties @transformation.explicit_cf_compatible @@ -139,9 +138,10 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: # kernel (i.e. 
a GPU_device scheduled map) if not out_of_kernel_copy.applicable(copy_context): continue - + # If the subset has more than 2 dimensions and is not contiguous (represented as a 1D memcpy) then fallback to a copy kernel - if len(edge.data.subset) > 2 and not edge.data.subset.is_contiguous_subset(state.sdfg.arrays[edge.data.data]): + if len(edge.data.subset) > 2 and not edge.data.subset.is_contiguous_subset( + state.sdfg.arrays[edge.data.data]): # If other subset is not None, we do not need a nested SDFG if edge.data.other_subset is None: @@ -168,15 +168,13 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: # Add connectors to the out edge of map_entry and in edge of map_exit state.remove_edge(edge) else: - copy_sdfg, src_subset_expr, dst_subset_expr = create_sdfg_with_copy_only(state, src_node, dst_node, edge) - nsdfg = state.add_nested_sdfg( - sdfg=copy_sdfg, - inputs={"view_" + src_node.data}, - outputs={"view_" + dst_node.data}, - ) + view_src_node, src_subset_expr, view_dst_node, dst_subset_expr = create_viewed_copy_kernel( + state, src_node, dst_node, edge) state.remove_edge(edge) - state.add_edge(src_node, None, nsdfg, "view_" + src_node.data, dace.Memlet(f"{src_node.data}[{src_subset_expr}]")) - state.add_edge(nsdfg, "view_" + dst_node.data, dst_node, None, dace.Memlet(f"{dst_node.data}[{dst_subset_expr}]")) + state.add_edge(src_node, None, view_src_node, "views", + dace.Memlet(f"{src_node.data}[{src_subset_expr}]")) + state.add_edge(view_dst_node, "views", dst_node, None, + dace.Memlet(f"{dst_node.data}[{dst_subset_expr}]")) else: # Generatae the copy call code = out_of_kernel_copy.generate_copy(copy_context) @@ -184,7 +182,9 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: # Prepare GPU ustream connectors and the stream to be accessed from the # GPU stream array # Create the tasklet and add GPU stream related connectors - tasklet = state.add_tasklet("gpu_copy", { "_in_" + src_node.data }, { "_out_" + dst_node.data }, code, language=dtypes.Language.CPP) + tasklet = state.add_tasklet("gpu_copy", {"_in_" + src_node.data}, {"_out_" + dst_node.data}, + code, + language=dtypes.Language.CPP) # Put the tasklet in between the edge dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge @@ -198,14 +198,16 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: else: src_subset = memlet.subset if edge.data.data == src_node.data else memlet.other_subset dst_subset = memlet.other_subset if edge.data.data == src_node.data else memlet.subset - state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, dace.Memlet(data=src_node.data, subset=src_subset)) - state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn, dace.Memlet(data=dst_node.data, subset=dst_subset)) + state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, + dace.Memlet(data=src_node.data, subset=src_subset)) + state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn, + dace.Memlet(data=dst_node.data, subset=dst_subset)) state.remove_edge(edge) return {} def find_all_data_copies( - self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: + self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[dace.Memlet]]]: """ Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, destination node, and the first memlet edge of in the memlet path between source and destination 
node. @@ -217,7 +219,7 @@ def find_all_data_copies( Returns ------- - List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[dace.Memlet]]] A list of tuples representing the data copy, each containing: - The SDFG containing the copy - The state in which the copy occurs @@ -225,8 +227,8 @@ def find_all_data_copies( - The destination node of the copy - The first memlet edge representing the data movement """ - copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] - visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[dace.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[dace.Memlet]] = set() for sub_sdfg in sdfg.all_sdfgs_recursive(): for state in sub_sdfg.states(): @@ -253,4 +255,4 @@ def find_all_data_copies( # Add copy to the worklist copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) - return copy_worklist \ No newline at end of file + return copy_worklist diff --git a/tests/gpu_specialization/explicit_global_memory_copy_test.py b/tests/gpu_specialization/explicit_global_memory_copy_test.py index 989c1575d0..a44d556a2f 100644 --- a/tests/gpu_specialization/explicit_global_memory_copy_test.py +++ b/tests/gpu_specialization/explicit_global_memory_copy_test.py @@ -5,6 +5,7 @@ from typing import Tuple from dace.transformation.passes.gpu_specialization.insert_explicit_gpu_global_memory_copies import InsertExplicitGPUGlobalMemoryCopies + def _get_sdfg(name_str: str, dimension: Tuple[int], copy_strides: Tuple[int]) -> dace.SDFG: sdfg = dace.SDFG(name_str) state = sdfg.add_state("state0", is_start_block=True) @@ -26,12 +27,14 @@ def _get_sdfg_with_other_subset(name_str: str, dimension: Tuple[int], copy_strid a = state.add_access("A") b = state.add_access("B") # copy_str = ", ".join([f"0:{dimension[i]}:{copy_strides[i]}" for i in range(len(dimension))]) - src_subset = dace.subsets.Range([((dimension[i] // 2), dimension[i] - 1, copy_strides[i]) for i in range(len(dimension))]) + src_subset = dace.subsets.Range([((dimension[i] // 2), dimension[i] - 1, copy_strides[i]) + for i in range(len(dimension))]) dst_subset = dace.subsets.Range([(0, (dimension[i] // 2) - 1, copy_strides[i]) for i in range(len(dimension))]) state.add_edge(a, None, b, None, dace.Memlet(data="B", subset=dst_subset, other_subset=src_subset)) sdfg.validate() return sdfg + def _count_tasklets(sdfg: dace.SDFG) -> int: """Count the number of tasklets in the SDFG.""" count = 0 @@ -41,6 +44,7 @@ def _count_tasklets(sdfg: dace.SDFG) -> int: count += 1 return count + def _count_nsdfgs(sdfg: dace.SDFG) -> int: """Count the number of nested SDFGs in the SDFG.""" count = 0 @@ -50,13 +54,14 @@ def _count_nsdfgs(sdfg: dace.SDFG) -> int: count += 1 return count + @pytest.mark.gpu def test_1d_copy(): """Test 1D unit stride copy.""" import cupy as cp - dimension = (8,) - copy_strides = (1,) + dimension = (8, ) + copy_strides = (1, ) sdfg = _get_sdfg("test_1d_copy", dimension, copy_strides) InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) @@ -81,8 +86,8 @@ def test_1d_copy_w_other_subset(): """Test 1D unit stride copy.""" import cupy as cp - dimension = (8,) - copy_strides = (1,) + dimension = (8, ) + copy_strides = (1, ) sdfg = _get_sdfg_with_other_subset("test_1d_copy_w_other_subset", dimension, copy_strides) InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) 
@@ -132,6 +137,7 @@ def test_2d_copy(): print(f"2D copy: {num_tasklets} tasklets") + @pytest.mark.gpu def test_2d_copy_with_other_subset(): """Test 2D unit stride copy with other subset not None.""" @@ -159,6 +165,7 @@ def test_2d_copy_with_other_subset(): print(f"2D copy: {num_tasklets} tasklets") + @pytest.mark.gpu def test_3d_copy(): """Test 3D unit stride copy.""" @@ -193,28 +200,28 @@ def test_3d_copy(): def test_1d_strided_copy(stride): """Test 1D strided copy with varying strides.""" import cupy as cp - - dimension = (8,) - copy_strides = (stride,) - + + dimension = (8, ) + copy_strides = (stride, ) + sdfg = _get_sdfg(f"test_1d_strided_copy_s{stride}", dimension, copy_strides) InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) - + # Count tasklets num_tasklets = _count_tasklets(sdfg) assert num_tasklets == 1 - + # Test with cupy A = cp.random.rand(*dimension).astype(np.float32) B = cp.zeros_like(A) - + sdfg(A=A, B=B) - + # Verify correctness - only elements at stride intervals should be copied expected = cp.zeros_like(A) expected[::stride] = A[::stride] cp.testing.assert_array_equal(B[::stride], expected[::stride]) - + print(f"1D strided copy (stride={stride}): {num_tasklets} tasklets") @@ -249,14 +256,8 @@ def test_2d_strided_copy(stride_1, stride_2): @pytest.mark.gpu -@pytest.mark.parametrize("stride_1,stride_2,stride_3", [ - (1, 2, 2), - (1, 2, 4), - (1, 4, 2), - (4, 1, 1), - (4, 2, 1), - (2, 2, 1) -]) +@pytest.mark.parametrize("stride_1,stride_2,stride_3", [(1, 2, 2), (1, 2, 4), (1, 4, 2), (4, 1, 1), (4, 2, 1), + (2, 2, 1)]) def test_3d_strided_copy(stride_1, stride_2, stride_3): """Test 3D strided copy. First dimension is unit stride, others are strided.""" import cupy as cp @@ -264,8 +265,7 @@ def test_3d_strided_copy(stride_1, stride_2, stride_3): dimension = (8, 4, 4) copy_strides = (stride_1, stride_2, stride_3) - sdfg = _get_sdfg(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}", - dimension, copy_strides) + sdfg = _get_sdfg(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}", dimension, copy_strides) sdfg.save("x1.sdfg") InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) sdfg.save("x2.sdfg") @@ -287,10 +287,11 @@ def test_3d_strided_copy(stride_1, stride_2, stride_3): print(f"3D strided copy (strides={stride_1},{stride_2},{stride_3}): {num_tasklets} tasklets") + @pytest.mark.gpu @pytest.mark.parametrize("stride_1,stride_2,stride_3", [ - (1, 2, 2), - (1, 2, 4), + (1, 2, 2), + (1, 2, 4), (1, 4, 2), (2, 2, 1), ]) @@ -301,16 +302,13 @@ def test_3d_strided_copy_w_other_subset(stride_1, stride_2, stride_3): dimension = (8, 8, 8) copy_strides = (stride_1, stride_2, stride_3) - sdfg = _get_sdfg_with_other_subset(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}_w_other_subset", - dimension, copy_strides) - sdfg.save("pre.sdfg") + sdfg = _get_sdfg_with_other_subset(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}_w_other_subset", + dimension, copy_strides) InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) # Count tasklets num_tasklets = _count_tasklets(sdfg) - assert num_tasklets == 0 - num_nsdfgs = _count_nsdfgs(sdfg) - assert num_nsdfgs == 1 + assert num_tasklets == 1 # Test with cupy A = cp.random.rand(*dimension).astype(np.float32) @@ -321,8 +319,6 @@ def test_3d_strided_copy_w_other_subset(stride_1, stride_2, stride_3): # Verify correctness print(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]]) print(A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]]) - cp.testing.assert_array_equal( - 
B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]], - A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]] - ) - print(f"3D strided copy (strides={stride_1},{stride_2},{stride_3}): {num_tasklets} tasklets") \ No newline at end of file + cp.testing.assert_array_equal(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]], + A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]]) + print(f"3D strided copy (strides={stride_1},{stride_2},{stride_3}): {num_tasklets} tasklets") From 4d7156f47da60b998d038597fbf3038d4da91476 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Tue, 6 Jan 2026 15:35:17 +0100 Subject: [PATCH 03/14] Refactor --- .../explicit_global_memory_copy_test.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/gpu_specialization/explicit_global_memory_copy_test.py b/tests/gpu_specialization/explicit_global_memory_copy_test.py index a44d556a2f..d45f421c2a 100644 --- a/tests/gpu_specialization/explicit_global_memory_copy_test.py +++ b/tests/gpu_specialization/explicit_global_memory_copy_test.py @@ -135,8 +135,6 @@ def test_2d_copy(): assert num_tasklets == 1 - print(f"2D copy: {num_tasklets} tasklets") - @pytest.mark.gpu def test_2d_copy_with_other_subset(): @@ -163,8 +161,6 @@ def test_2d_copy_with_other_subset(): cp.testing.assert_array_equal(B[0:4, 0:4], expected) assert num_tasklets == 1 - print(f"2D copy: {num_tasklets} tasklets") - @pytest.mark.gpu def test_3d_copy(): @@ -192,8 +188,6 @@ def test_3d_copy(): assert num_tasklets == 1 - print(f"3D copy: {num_tasklets} tasklets") - @pytest.mark.gpu @pytest.mark.parametrize("stride", [2, 4]) @@ -222,8 +216,6 @@ def test_1d_strided_copy(stride): expected[::stride] = A[::stride] cp.testing.assert_array_equal(B[::stride], expected[::stride]) - print(f"1D strided copy (stride={stride}): {num_tasklets} tasklets") - @pytest.mark.gpu @pytest.mark.parametrize("stride_1,stride_2", [(2, 1), (4, 1), (1, 2), (1, 4)]) @@ -252,8 +244,6 @@ def test_2d_strided_copy(stride_1, stride_2): expected[::stride_1, ::stride_2] = A[::stride_1, ::stride_2] cp.testing.assert_array_equal(B[::stride_1, ::stride_2], expected[::stride_1, ::stride_2]) - print(f"2D strided copy (strides={stride_1},{stride_2}): {num_tasklets} tasklets") - @pytest.mark.gpu @pytest.mark.parametrize("stride_1,stride_2,stride_3", [(1, 2, 2), (1, 2, 4), (1, 4, 2), (4, 1, 1), (4, 2, 1), @@ -285,8 +275,6 @@ def test_3d_strided_copy(stride_1, stride_2, stride_3): expected[::stride_1, ::stride_2, ::stride_3] = A[::stride_1, ::stride_2, ::stride_3] cp.testing.assert_array_equal(B, expected) - print(f"3D strided copy (strides={stride_1},{stride_2},{stride_3}): {num_tasklets} tasklets") - @pytest.mark.gpu @pytest.mark.parametrize("stride_1,stride_2,stride_3", [ @@ -317,8 +305,5 @@ def test_3d_strided_copy_w_other_subset(stride_1, stride_2, stride_3): sdfg(A=A, B=B) # Verify correctness - print(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]]) - print(A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]]) cp.testing.assert_array_equal(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]], A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]]) - print(f"3D strided copy (strides={stride_1},{stride_2},{stride_3}): {num_tasklets} tasklets") From 09e29e66fbe64baf363d83bf47db168a08d35f03 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Tue, 6 Jan 2026 16:02:43 +0100 Subject: [PATCH 04/14] Refactor --- .../gpu_specialization/helpers}/__init__.py | 0 
.../helpers}/copy_strategies.py | 8 ++------ .../gpu_specialization/helpers/gpu_helpers.py} | 0 .../insert_explicit_gpu_global_memory_copies.py | 16 ++++++++-------- 4 files changed, 10 insertions(+), 14 deletions(-) rename dace/{codegen/gpu_specialization_utilities => transformation/passes/gpu_specialization/helpers}/__init__.py (100%) rename dace/{codegen/gpu_specialization_utilities => transformation/passes/gpu_specialization/helpers}/copy_strategies.py (99%) rename dace/{codegen/gpu_specialization_utilities/gpu_utils.py => transformation/passes/gpu_specialization/helpers/gpu_helpers.py} (100%) diff --git a/dace/codegen/gpu_specialization_utilities/__init__.py b/dace/transformation/passes/gpu_specialization/helpers/__init__.py similarity index 100% rename from dace/codegen/gpu_specialization_utilities/__init__.py rename to dace/transformation/passes/gpu_specialization/helpers/__init__.py diff --git a/dace/codegen/gpu_specialization_utilities/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py similarity index 99% rename from dace/codegen/gpu_specialization_utilities/copy_strategies.py rename to dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py index 518e27c21f..506e9b6b99 100644 --- a/dace/codegen/gpu_specialization_utilities/copy_strategies.py +++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py @@ -1,17 +1,13 @@ # Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Tuple, Union - +from typing import Any, Dict, Tuple, Union from dace import SDFG, SDFGState, data, dtypes, subsets from dace import memlet as mm -from dace import symbolic from dace.codegen import common from dace.codegen.targets import cpp from dace.codegen.targets.cpp import sym2cpp -from dace.codegen.gpu_specialization_utilities.gpu_utils import generate_sync_debug_call -from dace.config import Config +from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import generate_sync_debug_call from dace.dtypes import StorageType -from dace.frontend import operations from dace.sdfg import nodes, scope_contains_scope from dace.sdfg.graph import MultiConnectorEdge from dace.transformation import helpers diff --git a/dace/codegen/gpu_specialization_utilities/gpu_utils.py b/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py similarity index 100% rename from dace/codegen/gpu_specialization_utilities/gpu_utils.py rename to dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py diff --git a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py index 7c674e123c..359fec7bae 100644 --- a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py +++ b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py @@ -4,9 +4,7 @@ import dace from dace import SDFG, SDFGState, dtypes, properties -from dace.codegen.gpu_specialization_utilities.copy_strategies import CopyContext, OutOfKernelCopyStrategy -from dace.config import Config -from dace.sdfg import nodes, scope_contains_scope +from dace.transformation.passes.gpu_specialization.helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy from dace.sdfg.graph import Edge, MultiConnectorEdge from dace.transformation import pass_pipeline as ppl, 
transformation
 
@@ -117,8 +115,8 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict:
         """
 
         # Prepare GPU stream
-        # gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']
-        gpustream_assignments: Dict[nodes.Node, Union[int, str]] = dict()
+        # gpustream_assignments: Dict[dace.nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']
+        gpustream_assignments: Dict[dace.nodes.Node, Union[int, str]] = dict()
 
         # Initialize the strategy for copies that occur outside of kernel execution
         out_of_kernel_copy = OutOfKernelCopyStrategy()
@@ -207,7 +205,8 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict:
         return {}
 
     def find_all_data_copies(
-            self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[dace.Memlet]]]:
+        self, sdfg: SDFG
+    ) -> List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, MultiConnectorEdge[dace.Memlet]]]:
         """
         Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node,
         destination node, and the first memlet edge in the memlet path between source and destination node.
@@ -219,7 +218,7 @@ def find_all_data_copies(
 
         Returns
         -------
-        List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[dace.Memlet]]]
+        List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, MultiConnectorEdge[dace.Memlet]]]
             A list of tuples representing the data copy, each containing:
             - The SDFG containing the copy
             - The state in which the copy occurs
@@ -227,7 +226,8 @@ def find_all_data_copies(
             - The destination node of the copy
             - The first memlet edge representing the data movement
         """
-        copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[dace.Memlet]]] = []
+        copy_worklist: List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node,
+                                  MultiConnectorEdge[dace.Memlet]]] = []
         visited_edges: Set[MultiConnectorEdge[dace.Memlet]] = set()
 
         for sub_sdfg in sdfg.all_sdfgs_recursive():

From 8a754c3d1c08c82e0ee42401cce21e49f2531bcf Mon Sep 17 00:00:00 2001
From: Yakup Koray Budanaz
Date: Wed, 7 Jan 2026 13:02:43 +0100
Subject: [PATCH 05/14] Extensions

---
 dace/dtypes.py                                | 14 ++++++++++--
 ...nsert_explicit_gpu_global_memory_copies.py |  4 +++-
 .../explicit_global_memory_copy_test.py       | 21 ++++++++++++++++++
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/dace/dtypes.py b/dace/dtypes.py
index faadc84a50..1ca30420c7 100644
--- a/dace/dtypes.py
+++ b/dace/dtypes.py
@@ -77,6 +77,7 @@ class ScheduleType(aenum.AutoNumberEnum):
     Snitch = ()
     Snitch_Multicore = ()
     FPGA_Multi_Pumped = ()  #: Used for double pumping
+    GPU_Warp = ()  #: Warp-level schedule (ExperimentalCUDACodeGen)
 
 
 # A subset of GPU schedule types
@@ -87,6 +88,11 @@ class ScheduleType(aenum.AutoNumberEnum):
     ScheduleType.GPU_Persistent,
 ]
 
+# A subset of GPU schedule types for ExperimentalCUDACodeGen
+EXPERIMENTAL_GPU_SCHEDULES = [
+    ScheduleType.GPU_Warp,
+]
+
 # A subset of CPU schedule types
 CPU_SCHEDULES = [
     ScheduleType.CPU_Multicore,
@@ -204,7 +210,8 @@ class TilingType(aenum.AutoNumberEnum):
     ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register,
     ScheduleType.FPGA_Device: StorageType.FPGA_Global,
     ScheduleType.SVE_Map: StorageType.CPU_Heap,
-    ScheduleType.Snitch: StorageType.Snitch_TCDM
+    ScheduleType.Snitch: StorageType.Snitch_TCDM,
+    ScheduleType.GPU_Warp: StorageType.Register,
 }
 
 # Maps from ScheduleType to default ScheduleType for sub-scopes
@@ -225,7 +232,8 @@ class TilingType(aenum.AutoNumberEnum):
     ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device,
     ScheduleType.SVE_Map: ScheduleType.Sequential,
     ScheduleType.Snitch: ScheduleType.Snitch,
-    ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore
+    ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore,
+    ScheduleType.GPU_Warp: ScheduleType.Sequential,
 }
 
 # Maps from StorageType to a preferred ScheduleType for helping determine schedules.
@@ -1266,6 +1274,7 @@ def isconstant(var):
 complex128 = typeclass(numpy.complex128)
 string = stringtype()
 MPI_Request = opaque('MPI_Request')
+gpuStream_t = opaque('gpuStream_t')
 
 
 @undefined_safe_enum
@@ -1286,6 +1295,7 @@ class Typeclasses(aenum.AutoNumberEnum):
     float64 = float64
     complex64 = complex64
     complex128 = complex128
+    gpuStream_t = gpuStream_t
 
     _bool = bool
 
diff --git a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py
index 359fec7bae..2a74ac5235 100644
--- a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py
+++ b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py
@@ -188,7 +188,9 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict:
             dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge
 
             if memlet.other_subset is None:
-                state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, copy.deepcopy(memlet))
+                src_memlet = copy.deepcopy(memlet)
+                src_memlet.data = src_node.data
+                state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, src_memlet)
                 dst_memlet = copy.deepcopy(memlet)
                 dst_memlet.data = dst_node.data
                 state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn, dst_memlet)
diff --git a/tests/gpu_specialization/explicit_global_memory_copy_test.py b/tests/gpu_specialization/explicit_global_memory_copy_test.py
index d45f421c2a..92cefed48a 100644
--- a/tests/gpu_specialization/explicit_global_memory_copy_test.py
+++ b/tests/gpu_specialization/explicit_global_memory_copy_test.py
@@ -307,3 +307,24 @@ def test_3d_strided_copy_w_other_subset(stride_1, stride_2, stride_3):
     # Verify correctness
     cp.testing.assert_array_equal(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]],
                                   A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]])
+
+
+@pytest.mark.gpu
+def test_independent_copies():
+    """Test explicit copy insertion on an SDFG with two independent GPU maps."""
+
+    @dace.program
+    def independent_copies(A: dace.uint32[128], B: dace.uint32[128], C: dace.uint32[128], D: dace.uint32[128]):
+        for i in dace.map[0:128:1]:
+            B[i] = A[i]
+        for i in dace.map[0:128:1]:
+            D[i] = C[i]
+
+    sdfg = independent_copies.to_sdfg()
+    sdfg.apply_gpu_transformations()
+    sdfg.validate()
+
+    InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {})
+
+    sdfg.validate()
+    sdfg.compile()

From dad01d32d49e8149952b8389ba369cd3e4ebc6c9 Mon Sep 17 00:00:00 2001
From: Yakup Koray Budanaz
Date: Wed, 7 Jan 2026 13:03:04 +0100
Subject: [PATCH 06/14] Fix bug

---
 .../passes/gpu_specialization/helpers/copy_strategies.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py
index 506e9b6b99..0523e7798e 100644
--- a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py
+++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py
@@ -188,7 +188,7 @@ def _expr_for(desc, name, subset):
if isinstance(desc, data.Scalar): # GPU scalar special-case - if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + if desc.storage in dtypes.GPU_STORAGES: parent = state.sdfg.parent_nsdfg_node if parent is not None and name in parent.in_connectors: return f"&{ptr}" @@ -313,7 +313,7 @@ def applicable(self, copy_context: CopyContext) -> bool: parent_map_tuple = helpers.get_parent_map(state, deeper_node) while parent_map_tuple is not None: parent_map, parent_state = parent_map_tuple - if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + if parent_map.map.schedule in dtypes.GPU_SCHEDULES + dtypes.EXPERIMENTAL_GPU_SCHEDULES: return False else: parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) From e38016cee466e003d870fc5bcae32494f7411137 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Wed, 7 Jan 2026 13:41:10 +0100 Subject: [PATCH 07/14] Fix --- .../helpers/copy_strategies.py | 27 +++---------------- .../gpu_specialization/helpers/gpu_helpers.py | 8 ++++++ ...nsert_explicit_gpu_global_memory_copies.py | 10 ++----- 3 files changed, 14 insertions(+), 31 deletions(-) diff --git a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py index 0523e7798e..b511b30b2f 100644 --- a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py +++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py @@ -21,7 +21,7 @@ class CopyContext: """ def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, - edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + edge: MultiConnectorEdge[mm.Memlet]): # Store the basic context as attributes self.sdfg = sdfg @@ -29,7 +29,6 @@ def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: self.src_node = src_node self.dst_node = dst_node self.edge = edge - self.gpustream_assignments = gpustream_assignments memlet = edge.data @@ -69,30 +68,12 @@ def get_storage_type(self, node: nodes.Node): def get_assigned_gpustream(self) -> str: """ Return the GPU stream expression assigned to both source and destination nodes. - - Ensures that both nodes have a matching stream ID, then constructs the - variable name from the configured prefix and stream ID. Raises ValueError - if assignments are missing or inconsistent. - - Example: - If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, - this method returns 'gpu_stream0'. + Defaults to `__dace_current_stream` placeholder, which can be changed by the scheduling pass """ - src_stream = self.gpustream_assignments.get(self.src_node) - dst_stream = self.gpustream_assignments.get(self.dst_node) - - # 1. Catch unsupported cases - if src_stream is None or dst_stream is None: - raise ValueError("GPU stream assignment missing for source or destination node.") - - if src_stream != dst_stream: - raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " - f"dst_node has '{dst_stream}'. They must be the same.") - # 2. 
Generate GPU stream expression - gpustream = src_stream + gpustream = "__state->gpu_context->streams[0]" # gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] - gpustream_expr = f"{gpustream}" # {gpustream_var_name_prefix} + gpustream_expr = gpustream return gpustream_expr diff --git a/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py b/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py index b02340b338..be9d510602 100644 --- a/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py +++ b/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py @@ -3,6 +3,14 @@ from dace.codegen import common +def get_gpu_stream_array_name() -> str: + return "gpu_streams" + + +def get_gpu_stream_connector_name() -> str: + return "__stream_" + + def generate_sync_debug_call() -> str: """ Generate backend sync and error-check calls as a string if diff --git a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py index 2a74ac5235..34cd37de4a 100644 --- a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py +++ b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py @@ -7,6 +7,7 @@ from dace.transformation.passes.gpu_specialization.helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy from dace.sdfg.graph import Edge, MultiConnectorEdge from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import get_gpu_stream_connector_name def create_viewed_copy_kernel(parent_state: dace.SDFGState, src_node: dace.nodes.AccessNode, @@ -115,22 +116,15 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: """ # Prepare GPU stream - # gpustream_assignments: Dict[dace.nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] - gpustream_assignments: Dict[dace.nodes.Node, Union[int, str]] = dict() - # Initialize the strategy for copies that occur outside of kernel execution out_of_kernel_copy = OutOfKernelCopyStrategy() # Get all data copies to process the out of kernel copies copy_worklist = self.find_all_data_copies(sdfg) - for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: - gpustream_assignments[src_node] = "__dace_current_stream" - gpustream_assignments[dst_node] = "__dace_current_stream" - for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: - copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments) + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge) # Only insert copy tasklets for GPU related copies occuring out of the # kernel (i.e. 
a GPU_device scheduled map) From 944db27743ef0887ec263148943d5d1b2eead4be Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Wed, 7 Jan 2026 14:57:31 +0100 Subject: [PATCH 08/14] Check for GPU outputs in current stream generation --- dace/codegen/targets/cpp.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index b451668831..4192416505 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -899,9 +899,12 @@ def unparse_cr(sdfg, wcr_ast, dtype): def connected_to_gpu_memory(node: nodes.Node, state: SDFGState, sdfg: SDFG): for e in state.all_edges(node): path = state.memlet_path(e) - if ((isinstance(path[0].src, nodes.AccessNode) - and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)): + if (((isinstance(path[0].src, nodes.AccessNode) + and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)) or + ((isinstance(path[-1].dst, nodes.AccessNode) + and path[-1].dst.desc(sdfg).storage is dtypes.StorageType.GPU_Global))): return True + return False From c77ea55ed1307ddfd63b9a6967ca24e4ceffe9f8 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Wed, 7 Jan 2026 15:42:20 +0100 Subject: [PATCH 09/14] Fix cpp codegen --- dace/codegen/targets/cpp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 4192416505..ac30732069 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -900,9 +900,9 @@ def connected_to_gpu_memory(node: nodes.Node, state: SDFGState, sdfg: SDFG): for e in state.all_edges(node): path = state.memlet_path(e) if (((isinstance(path[0].src, nodes.AccessNode) - and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)) or - ((isinstance(path[-1].dst, nodes.AccessNode) - and path[-1].dst.desc(sdfg).storage is dtypes.StorageType.GPU_Global))): + and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)) + or ((isinstance(path[-1].dst, nodes.AccessNode) + and path[-1].dst.desc(sdfg).storage is dtypes.StorageType.GPU_Global))): return True return False From 0ed640600743dd070019e3e1e01699c23daf2244 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Fri, 9 Jan 2026 21:57:13 +0100 Subject: [PATCH 10/14] Precommit --- .../passes/gpu_specialization/helpers/copy_strategies.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py index b511b30b2f..2d4e287562 100644 --- a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py +++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py @@ -71,8 +71,7 @@ def get_assigned_gpustream(self) -> str: Defaults to `__dace_current_stream` placeholder, which can be changed by the scheduling pass """ # 2. 
Generate GPU stream expression - gpustream = "__state->gpu_context->streams[0]" - # gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream = "__dace_current_stream" gpustream_expr = gpustream return gpustream_expr From bca117ca74ae7fcbd6486f1255b7f6acffd0dd2c Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Sat, 24 Jan 2026 16:46:51 +0100 Subject: [PATCH 11/14] Add dtype --- dace/dtypes.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index 6b8dec7d98..1448d0c4db 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -101,6 +101,8 @@ class ScheduleType(AutoNumberEnum): StorageType.GPU_Shared, ] +GPU_KERNEL_ACCESSIBLE_STORAGES = [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.Register] + @undefined_safe_enum class ReductionType(AutoNumberEnum): @@ -214,10 +216,10 @@ class TilingType(AutoNumberEnum): ScheduleType.GPU_Device: ScheduleType.GPU_ThreadBlock, ScheduleType.GPU_ThreadBlock: ScheduleType.Sequential, ScheduleType.GPU_ThreadBlock_Dynamic: ScheduleType.Sequential, + ScheduleType.GPU_Warp: ScheduleType.Sequential, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, - ScheduleType.GPU_Warp: ScheduleType.Sequential, } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. @@ -1248,6 +1250,7 @@ class string(_DaCeArray, npt.NDArray[numpy.str_]): ... class vector(_DaCeArray, npt.NDArray[numpy.void]): ... class MPI_Request(_DaCeArray, npt.NDArray[numpy.void]): ... class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ... + class gpuStream_t(_DaCeArray, npt.NDArray[numpy.void]): ... # yapf: enable else: # Runtime definitions @@ -1268,6 +1271,7 @@ class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ... 
complex128 = typeclass(numpy.complex128) string = stringtype() MPI_Request = opaque('MPI_Request') + gpuStream_t = opaque('gpuStream_t') float32sr = Float32sr() @@ -1517,6 +1521,8 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Persistent, ScheduleType.GPU_ThreadBlock, ScheduleType.GPU_ThreadBlock_Dynamic, + ScheduleType.GPU_Default, + ScheduleType.GPU_Warp, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]: From 9dd160589f54671d4cb85deecd2099f06ff446e2 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Sat, 24 Jan 2026 17:05:53 +0100 Subject: [PATCH 12/14] Rm gpu def --- dace/dtypes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index 1448d0c4db..190d078d0f 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1521,7 +1521,6 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Persistent, ScheduleType.GPU_ThreadBlock, ScheduleType.GPU_ThreadBlock_Dynamic, - ScheduleType.GPU_Default, ScheduleType.GPU_Warp, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] From c9cc32dd0212b252d70bb17af5b940f71cbb9296 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Wed, 28 Jan 2026 17:44:21 +0100 Subject: [PATCH 13/14] Update dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py Co-authored-by: alexnick83 <31545860+alexnick83@users.noreply.github.com> --- .../passes/gpu_specialization/helpers/copy_strategies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py index 2d4e287562..1a06680b77 100644 --- a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py +++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py @@ -514,7 +514,7 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: # Generate call and write it call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst}, {dpitch}, _in_{src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - # Potentially snychronization required if syncdebug is set to true in configurations + # Potentially synchronization required if syncdebug is set to true in configurations call += generate_sync_debug_call() # Write for-loop footers From 6aa4c2761f9e6e8f4eb79a1cbd40f4b20bb5d334 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Wed, 28 Jan 2026 17:46:26 +0100 Subject: [PATCH 14/14] Update dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py Co-authored-by: alexnick83 <31545860+alexnick83@users.noreply.github.com> --- .../passes/gpu_specialization/helpers/copy_strategies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py index 1a06680b77..1c22db4bd2 100644 --- a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py +++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py @@ -382,7 +382,7 @@ def _generate_1d_copy(self, copy_context: CopyContext) -> str: call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - # Potentially 
snychronization required if syncdebug is set to true in configurations + # Potentially synchronization required if syncdebug is set to true in configurations call = call + generate_sync_debug_call() return call
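
Usage note: the pass introduced in this series is driven directly in the tests above; the following is a minimal, self-contained sketch of the same flow. The SDFG construction mirrors `_get_sdfg` from the test file; the array names, shape, and dtype here are illustrative only and not part of the patches.

    import dace
    from dace.transformation.passes.gpu_specialization.insert_explicit_gpu_global_memory_copies import \
        InsertExplicitGPUGlobalMemoryCopies

    # Build a minimal GPU-global to GPU-global copy: B[0:8] = A[0:8]
    sdfg = dace.SDFG("explicit_copy_example")
    state = sdfg.add_state("state0", is_start_block=True)
    sdfg.add_array("A", (8, ), dace.float32, storage=dace.StorageType.GPU_Global)
    sdfg.add_array("B", (8, ), dace.float32, storage=dace.StorageType.GPU_Global)
    a = state.add_access("A")
    b = state.add_access("B")
    state.add_edge(a, None, b, None, dace.Memlet("A[0:8]"))

    # Rewrite the implicit access-node-to-access-node copy into an explicit one,
    # as the tests do, passing an empty pipeline_results dict
    InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {})
    sdfg.validate()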