diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index c7ea6258a6..5c112d1e89 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -807,9 +807,12 @@ def unparse_cr(sdfg, wcr_ast, dtype): def connected_to_gpu_memory(node: nodes.Node, state: SDFGState, sdfg: SDFG): for e in state.all_edges(node): path = state.memlet_path(e) - if ((isinstance(path[0].src, nodes.AccessNode) - and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)): + if (((isinstance(path[0].src, nodes.AccessNode) + and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)) + or ((isinstance(path[-1].dst, nodes.AccessNode) + and path[-1].dst.desc(sdfg).storage is dtypes.StorageType.GPU_Global))): return True + return False diff --git a/dace/dtypes.py b/dace/dtypes.py index c2835d85a4..190d078d0f 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -71,6 +71,7 @@ class ScheduleType(AutoNumberEnum): GPU_ThreadBlock = () #: Thread-block code GPU_ThreadBlock_Dynamic = () #: Allows rescheduling work within a block GPU_Persistent = () + GPU_Warp = () Snitch = () Snitch_Multicore = () @@ -84,6 +85,11 @@ class ScheduleType(AutoNumberEnum): ScheduleType.GPU_Persistent, ] +# A subset of GPU schedule types for ExperimentalCUDACodeGen +EXPERIMENTAL_GPU_SCHEDULES = [ + ScheduleType.GPU_Warp, +] + # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, @@ -95,6 +101,8 @@ class ScheduleType(AutoNumberEnum): StorageType.GPU_Shared, ] +GPU_KERNEL_ACCESSIBLE_STORAGES = [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.Register] + @undefined_safe_enum class ReductionType(AutoNumberEnum): @@ -192,7 +200,8 @@ class TilingType(AutoNumberEnum): ScheduleType.GPU_ThreadBlock: StorageType.Register, ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + ScheduleType.GPU_Warp: StorageType.Register, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -207,9 +216,10 @@ class TilingType(AutoNumberEnum): ScheduleType.GPU_Device: ScheduleType.GPU_ThreadBlock, ScheduleType.GPU_ThreadBlock: ScheduleType.Sequential, ScheduleType.GPU_ThreadBlock_Dynamic: ScheduleType.Sequential, + ScheduleType.GPU_Warp: ScheduleType.Sequential, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. @@ -1240,6 +1250,7 @@ class string(_DaCeArray, npt.NDArray[numpy.str_]): ... class vector(_DaCeArray, npt.NDArray[numpy.void]): ... class MPI_Request(_DaCeArray, npt.NDArray[numpy.void]): ... class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ... + class gpuStream_t(_DaCeArray, npt.NDArray[numpy.void]): ... # yapf: enable else: # Runtime definitions @@ -1260,6 +1271,7 @@ class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ... 
complex128 = typeclass(numpy.complex128) string = stringtype() MPI_Request = opaque('MPI_Request') + gpuStream_t = opaque('gpuStream_t') float32sr = Float32sr() @@ -1281,6 +1293,7 @@ class Typeclasses(AutoNumberEnum): float64 = float64 complex64 = complex64 complex128 = complex128 + gpuStream_t = gpuStream_t _bool = bool @@ -1508,6 +1521,7 @@ def can_access(schedule: ScheduleType, storage: StorageType): ScheduleType.GPU_Persistent, ScheduleType.GPU_ThreadBlock, ScheduleType.GPU_ThreadBlock_Dynamic, + ScheduleType.GPU_Warp, ]: return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned] elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]: diff --git a/dace/transformation/passes/gpu_specialization/__init__.py b/dace/transformation/passes/gpu_specialization/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/transformation/passes/gpu_specialization/helpers/__init__.py b/dace/transformation/passes/gpu_specialization/helpers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py new file mode 100644 index 0000000000..1c22db4bd2 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/helpers/copy_strategies.py @@ -0,0 +1,525 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, Tuple, Union +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp +from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import generate_sync_debug_call +from dace.dtypes import StorageType +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. + """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. 
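+
+        For example (illustrative), an AccessNode over an array declared with
+        ``StorageType.GPU_Global`` yields ``StorageType.GPU_Global``, while a
+        Tasklet yields ``StorageType.Register``.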
+ """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + Defaults to `__dace_current_stream` placeholder, which can be changed by the scheduling pass + """ + # 2. Generate GPU stream expression + gpustream = "__dace_current_stream" + gpustream_expr = gpustream + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. + """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. 
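+
+        Examples
+        --------
+        Illustrative (hypothetical) values: for a column copy ``A[0:8, 3] -> B[0:8, 3]``
+        between row-major 8x4 arrays, one would expect roughly
+        ``copy_shape == [8]``, ``src_strides == dst_strides == [4]``, and the
+        returned expressions to be the data names ``"A"`` and ``"B"``
+        (exact values depend on the stride-collapsing logic above).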
+ """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_STORAGES: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = src_subset + # dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + return copy_shape, src_strides, dst_strides, src_node.data, dst_node.data + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + 
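+    # Illustrative usage sketch (hypothetical dispatch loop): a caller builds a
+    # CopyContext and uses the first strategy that reports applicability, e.g.
+    #
+    #     ctx = CopyContext(sdfg, state, src_node, dst_node, edge)
+    #     for strategy in (OutOfKernelCopyStrategy(),):
+    #         if strategy.applicable(ctx):
+    #             code = strategy.generate_copy(ctx)
+    #             break
+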
+ @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES + dtypes.EXPERIMENTAL_GPU_SCHEDULES: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False + + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." 
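+            # More than two dimensions: _generate_nd_copy emits nested loops
+            # around 2D async copies of the innermost two dimensions.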
+ copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. + """ + # ----------- Retrieve relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call -------------------- + + if is_contiguous_copy: + # Memory is linear: can use {backend}MemcpyAsync + copysize = ' * '.join(sym2cpp(copy_shape)) + copysize += f' * sizeof({ctype})' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync(_out_{dst_expr}, _in_{src_expr}, {copysize}, {kind}, {gpustream}));\n' + + else: + # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch + # This allows copying a strided 1D region + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Potentially synchronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() + return call + + def _generate_2d_copy(self, copy_context: CopyContext) -> None: + """ + Generates a 2D memory copy using {backend}Memcpy2DAsync. + + Three main cases are handled: + - Copy between row-major stored arrays with contiguous rows. + - Copy between column-major stored arrays with contiguous columns. + - A special case where a 2D copy can still be represented. + + Raises: + NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. + Such cases indicate an unsupported 2D copy and should be examined separately. + They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. + + Note: + {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), + but not both simultaneously. + """ + + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call if supported -------------------- + # Case: Row-major layout, rows are not strided. 
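+        # Illustrative example (hypothetical sizes): copying an 8x4 block out of an
+        # 8x8 row-major array A into a dense 8x4 array B gives src_strides == [8, 1]
+        # and dst_strides == [4, 1], i.e. spitch = 8 * sizeof(T),
+        # dpitch = 4 * sizeof(T), width = 4 * sizeof(T), height = 8.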
+        if (src_strides[1] == 1) and (dst_strides[1] == 1):
+            dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[0])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Case: Column-major layout, no columns are strided.
+        elif (src_strides[0] == 1) and (dst_strides[0] == 1):
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})'
+            height = f'{sym2cpp(copy_shape[1])}'
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Special case: the strides collapse into a single 2D copy
+        elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]):
+            # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with
+            # copy shape [I, J], src_strides [J*K, K], dst_strides [J, 1]. This can be represented with a
+            # single {backend}Memcpy2DAsync call!
+
+            dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})'
+            spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = sym2cpp(copy_shape[0] * copy_shape[1])
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst_expr}, {dpitch}, _in_{src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        else:
+            raise NotImplementedError(
+                f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}. "
+                "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken."
+            )
+
+        # Synchronization is potentially required if syncdebug is enabled in the configuration
+        call = call + generate_sync_debug_call()
+        return call
+
+    def _generate_nd_copy(self, copy_context: CopyContext) -> str:
+        """
+        Generates GPU code for copying N-dimensional arrays using 2D memory copies.
+
+        Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops
+        for any outer dimensions. Expects the copy to be contiguous and between
+        row-major storage locations.
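+
+        Illustrative example (hypothetical shape): for a copy_shape of [D0, D1, D2],
+        a single loop over ``__copyidx0 in [0, D0)`` is emitted, and each iteration
+        issues one {backend}Memcpy2DAsync that transfers a D1 x D2 tile, with the
+        loop index folded into the source and destination offsets via the
+        outermost strides.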
+        """
+        # ----------- Extract relevant copy parameters --------------
+        backend: str = common.get_gpu_backend()
+
+        # Due to applicable(), src and dst node must be AccessNodes
+        copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info()
+
+        src_location, dst_location = copy_context.get_memory_location()
+        ctype = copy_context.get_ctype()
+        gpustream = copy_context.get_assigned_gpustream()
+        num_dims = len(copy_shape)
+
+        # ----------- Guard against unsupported patterns --------------
+        if not (src_strides[-1] == 1 and dst_strides[-1] == 1):
+            src_node, dst_node = copy_context.src_node, copy_context.dst_node
+            src_storage = copy_context.get_storage_type(src_node)
+            dst_storage = copy_context.get_storage_type(dst_node)
+            raise NotImplementedError(
+                "N-dimensional GPU memory copies that are strided or involve column-major arrays are currently not supported.\n"
+                f"  Source node: {src_node} (storage: {src_storage})\n"
+                f"  Destination node: {copy_context.dst_node} (storage: {dst_storage})\n"
+                f"  Source strides: {src_strides}\n"
+                f"  Destination strides: {dst_strides}\n"
+                f"  Copy shape: {copy_shape}\n")
+
+        # ----------------- Generate and write backend call(s) --------------------
+
+        call = ""
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n"
+
+        # Write Memcpy2DAsync
+        offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})'
+        spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})'
+        width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})'
+        height = sym2cpp(copy_shape[-2])
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate call and write it
+        call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync(_out_{dst}, {dpitch}, _in_{src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n'
+
+        # Synchronization is potentially required if syncdebug is enabled in the configuration
+        call += generate_sync_debug_call()
+
+        # Write for-loop footers
+        for dim in range(num_dims - 2):
+            call += "\n}"
+
+        # Return the code
+        return call
diff --git a/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py b/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py
new file mode 100644
index 0000000000..be9d510602
--- /dev/null
+++ b/dace/transformation/passes/gpu_specialization/helpers/gpu_helpers.py
@@ -0,0 +1,35 @@
+# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
+from dace import Config
+from dace.codegen import common
+
+
+def get_gpu_stream_array_name() -> str:
+    return "gpu_streams"
+
+
+def get_gpu_stream_connector_name() -> str:
+    return "__stream_"
+
+
+def generate_sync_debug_call() -> str:
+    """
+    Generate backend synchronization and error-check calls as a string if
+    synchronous debugging is enabled. The backend prefix (e.g., 'cuda') is
+    determined via common.get_gpu_backend().
+
+    Returns
+    -------
+    str
+        The generated debug call code, or an empty string if debugging is disabled.
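+
+    Examples
+    --------
+    With the CUDA backend and ``compiler.cuda.syncdebug`` enabled, the returned
+    string is::
+
+        DACE_GPU_CHECK(cudaGetLastError());
+        DACE_GPU_CHECK(cudaDeviceSynchronize());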
+ """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call diff --git a/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py new file mode 100644 index 0000000000..34cd37de4a --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_explicit_gpu_global_memory_copies.py @@ -0,0 +1,254 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.transformation.passes.gpu_specialization.helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.sdfg.graph import Edge, MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import get_gpu_stream_connector_name + + +def create_viewed_copy_kernel(parent_state: dace.SDFGState, src_node: dace.nodes.AccessNode, + dst_node: dace.nodes.AccessNode, edge: Edge[dace.Memlet]) -> dace.SDFG: + # Currently only 1D and 2D copies are supported + map_ranges = dict() + for i, dim in enumerate(edge.data.subset): + map_ranges[f"i{i}"] = f"0:{dim[1]+1-dim[0]}:{dim[2]}" + + access_expr = ",".join(f"i{i}" for i in range(len(edge.data.subset))) + + src_desc = parent_state.sdfg.arrays[src_node.data] + dst_desc = parent_state.sdfg.arrays[dst_node.data] + + # Add new arrays for the copy SDFG + # Determine src and dst subsets + src_subset = edge.data.subset if edge.data.data == src_node.data else edge.data.other_subset + dst_subset = edge.data.other_subset if edge.data.data == src_node.data else edge.data.subset + + # Collect the new shapes + src_shape = [e + 1 - b for b, e, s in src_subset] + dst_shape = [e + 1 - b for b, e, s in dst_subset] + + # Preserve strides as-is + src_strides = src_desc.strides + dst_strides = dst_desc.strides + + _, src_view = parent_state.sdfg.add_view("view_" + src_node.data, src_shape, src_desc.dtype, src_desc.storage, + src_strides) + _, dst_view = parent_state.sdfg.add_view("view_" + dst_node.data, dst_shape, dst_desc.dtype, dst_desc.storage, + dst_strides) + + # In nested SDFG we add "view_" prefix + view_src_node = parent_state.add_access("view_" + src_node.data) + view_dst_node = parent_state.add_access("view_" + dst_node.data) + + # Create string subset expressions to return + src_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in src_subset]) + dst_subset_expr = ", ".join([f"{b}:{e+1}:1" for b, e, s in dst_subset]) + + # Add copy kernel + tasklet, map_entry, map_exit = parent_state.add_mapped_tasklet( + name="gpu_copy_kernel_fallback", + map_ranges=map_ranges, + inputs={"_in": dace.memlet.Memlet(f"{view_src_node.data}[{access_expr}]")}, + outputs={"_out": dace.memlet.Memlet(f"{view_dst_node.data}[{access_expr}]")}, + code="_out = _in", + schedule=dtypes.ScheduleType.GPU_Device, + unroll_map=False, + language=dtypes.Language.Python, + external_edges=True, + propagate=True, + input_nodes={view_src_node.data: view_src_node}, + output_nodes={view_dst_node.data: view_dst_node}, + ) + + return view_src_node, src_subset_expr, view_dst_node, dst_subset_expr + + +@properties.make_properties 
+@transformation.explicit_cf_compatible +class InsertExplicitGPUGlobalMemoryCopies(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = set() + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. + """ + # Prepare GPU stream + + # Initialize the strategy for copies that occur outside of kernel execution + out_of_kernel_copy = OutOfKernelCopyStrategy() + + # Get all data copies to process the out of kernel copies + copy_worklist = self.find_all_data_copies(sdfg) + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge) + + # Only insert copy tasklets for GPU related copies occuring out of the + # kernel (i.e. 
a GPU_Device-scheduled map)
+            if not out_of_kernel_copy.applicable(copy_context):
+                continue
+
+            # If the subset has more than 2 dimensions and is not contiguous (i.e. it cannot
+            # be expressed as a single 1D memcpy), fall back to a copy kernel
+            if len(edge.data.subset) > 2 and not edge.data.subset.is_contiguous_subset(
+                    state.sdfg.arrays[edge.data.data]):
+
+                # If there is no other_subset, a plain mapped copy kernel over the subset suffices
+                if edge.data.other_subset is None:
+                    # Build a map over every dimension of the copied subset
+                    map_ranges = dict()
+                    for i, dim in enumerate(edge.data.subset):
+                        map_ranges[f"i{i}"] = f"{dim[0]}:{dim[1]+1}:{dim[2]}"
+                    access_expr = ",".join(f"i{i}" for i in range(len(edge.data.subset)))
+
+                    tasklet, map_entry, map_exit = state.add_mapped_tasklet(
+                        name="gpu_copy_kernel_fallback",
+                        map_ranges=map_ranges,
+                        inputs={"_in": dace.memlet.Memlet(f"{src_node.data}[{access_expr}]")},
+                        outputs={"_out": dace.memlet.Memlet(f"{dst_node.data}[{access_expr}]")},
+                        code="_out = _in",
+                        schedule=dtypes.ScheduleType.GPU_Device,
+                        unroll_map=False,
+                        language=dtypes.Language.Python,
+                        external_edges=True,
+                        propagate=True,
+                        input_nodes={src_node.data: src_node},
+                        output_nodes={dst_node.data: dst_node},
+                    )
+                    # The mapped tasklet now performs the copy; remove the original direct edge
+                    state.remove_edge(edge)
+                else:
+                    # Otherwise, route the copy through views so that source and destination
+                    # subsets can differ
+                    view_src_node, src_subset_expr, view_dst_node, dst_subset_expr = create_viewed_copy_kernel(
+                        state, src_node, dst_node, edge)
+                    state.remove_edge(edge)
+                    state.add_edge(src_node, None, view_src_node, "views",
+                                   dace.Memlet(f"{src_node.data}[{src_subset_expr}]"))
+                    state.add_edge(view_dst_node, "views", dst_node, None,
+                                   dace.Memlet(f"{dst_node.data}[{dst_subset_expr}]"))
+            else:
+                # Generate the copy call
+                code = out_of_kernel_copy.generate_copy(copy_context)
+
+                # Create the tasklet holding the generated copy code; the GPU stream
+                # placeholder in that code can later be bound by a scheduling pass
+                tasklet = state.add_tasklet("gpu_copy", {"_in_" + src_node.data}, {"_out_" + dst_node.data},
+                                            code,
+                                            language=dtypes.Language.CPP)
+
+                # Put the tasklet in between the edge
+                dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge
+
+                if memlet.other_subset is None:
+                    src_memlet = copy.deepcopy(memlet)
+                    src_memlet.data = src_node.data
+                    state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data, src_memlet)
+                    dst_memlet = copy.deepcopy(memlet)
+                    dst_memlet.data = dst_node.data
+                    state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn, dst_memlet)
+                    state.remove_edge(edge)
+                else:
+                    src_subset = memlet.subset if edge.data.data == src_node.data else memlet.other_subset
+                    dst_subset = memlet.other_subset if edge.data.data == src_node.data else memlet.subset
+                    state.add_edge(dst_node_pred, dst_node_conn, tasklet, "_in_" + src_node.data,
+                                   dace.Memlet(data=src_node.data, subset=src_subset))
+                    state.add_edge(tasklet, "_out_" + dst_node.data, dst_node, dst_conn,
+                                   dace.Memlet(data=dst_node.data, subset=dst_subset))
+                    state.remove_edge(edge)
+
+        return {}
+
+    def find_all_data_copies(
+        self, sdfg: SDFG
+    ) -> List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, MultiConnectorEdge[dace.Memlet]]]:
+        """
+        Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node,
+        destination node, and the first edge of the memlet path between source and destination node.
+
+        Parameters
+        ----------
+        sdfg : SDFG
+            The SDFG to analyze for potential data copies.
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, MultiConnectorEdge[dace.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, dace.nodes.Node, dace.nodes.Node, + MultiConnectorEdge[dace.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[dace.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist diff --git a/tests/gpu_specialization/explicit_global_memory_copy_test.py b/tests/gpu_specialization/explicit_global_memory_copy_test.py new file mode 100644 index 0000000000..92cefed48a --- /dev/null +++ b/tests/gpu_specialization/explicit_global_memory_copy_test.py @@ -0,0 +1,331 @@ +# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import pytest +import numpy as np +from typing import Tuple +from dace.transformation.passes.gpu_specialization.insert_explicit_gpu_global_memory_copies import InsertExplicitGPUGlobalMemoryCopies + + +def _get_sdfg(name_str: str, dimension: Tuple[int], copy_strides: Tuple[int]) -> dace.SDFG: + sdfg = dace.SDFG(name_str) + state = sdfg.add_state("state0", is_start_block=True) + for arr_name in ["A", "B"]: + sdfg.add_array(arr_name, dimension, dace.float32, dace.dtypes.StorageType.GPU_Global) + a = state.add_access("A") + b = state.add_access("B") + copy_str = ", ".join([f"0:{dimension[i]}:{copy_strides[i]}" for i in range(len(dimension))]) + state.add_edge(a, None, b, None, dace.Memlet(f"A[{copy_str}]")) + sdfg.validate() + return sdfg + + +def _get_sdfg_with_other_subset(name_str: str, dimension: Tuple[int], copy_strides: Tuple[int]) -> dace.SDFG: + sdfg = dace.SDFG(name_str) + state = sdfg.add_state("state0", is_start_block=True) + for arr_name in ["A", "B"]: + sdfg.add_array(arr_name, dimension, dace.float32, dace.dtypes.StorageType.GPU_Global) + a = state.add_access("A") + b = state.add_access("B") + # copy_str = ", ".join([f"0:{dimension[i]}:{copy_strides[i]}" for i in range(len(dimension))]) + src_subset = dace.subsets.Range([((dimension[i] // 2), dimension[i] - 1, copy_strides[i]) + for i in range(len(dimension))]) + dst_subset = dace.subsets.Range([(0, (dimension[i] // 2) - 1, copy_strides[i]) for i in range(len(dimension))]) + state.add_edge(a, None, b, None, dace.Memlet(data="B", subset=dst_subset, other_subset=src_subset)) + sdfg.validate() + return sdfg + + +def _count_tasklets(sdfg: dace.SDFG) -> int: + """Count the number of tasklets in the SDFG.""" + count = 0 + for state in sdfg.nodes(): + for node in state.nodes(): + if isinstance(node, dace.nodes.Tasklet): + count += 1 + return count + + +def _count_nsdfgs(sdfg: dace.SDFG) -> int: + """Count the number of 
nested SDFGs in the SDFG.""" + count = 0 + for state in sdfg.nodes(): + for node in state.nodes(): + if isinstance(node, dace.nodes.NestedSDFG): + count += 1 + return count + + +@pytest.mark.gpu +def test_1d_copy(): + """Test 1D unit stride copy.""" + import cupy as cp + + dimension = (8, ) + copy_strides = (1, ) + + sdfg = _get_sdfg("test_1d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0]] + cp.testing.assert_array_equal(B, expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_1d_copy_w_other_subset(): + """Test 1D unit stride copy.""" + import cupy as cp + + dimension = (8, ) + copy_strides = (1, ) + + sdfg = _get_sdfg_with_other_subset("test_1d_copy_w_other_subset", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("x.sdfg") + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[4:8:copy_strides[0]] + cp.testing.assert_array_equal(B[0:4], expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_2d_copy(): + """Test 2D unit stride copy with other subset not None.""" + import cupy as cp + + dimension = (8, 8) + copy_strides = (1, 1) + + sdfg = _get_sdfg("test_2d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0], ::copy_strides[1]] + cp.testing.assert_array_equal(B, expected) + + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_2d_copy_with_other_subset(): + """Test 2D unit stride copy with other subset not None.""" + import cupy as cp + + dimension = (8, 8) + copy_strides = (1, 1) + + sdfg = _get_sdfg_with_other_subset("test_2d_copy_with_other_subset", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[4:8:copy_strides[0], 4:8:copy_strides[1]] + cp.testing.assert_array_equal(B[0:4, 0:4], expected) + assert num_tasklets == 1 + + +@pytest.mark.gpu +def test_3d_copy(): + """Test 3D unit stride copy.""" + import cupy as cp + + dimension = (8, 4, 4) + copy_strides = (1, 1, 1) + + sdfg = _get_sdfg("test_3d_copy", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = A[::copy_strides[0], ::copy_strides[1], ::copy_strides[2]] + cp.testing.assert_array_equal(B, expected) + + assert num_tasklets == 1 + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride", [2, 4]) +def test_1d_strided_copy(stride): + """Test 1D strided copy with varying strides.""" + import cupy as cp + + dimension = (8, ) + copy_strides 
= (stride, ) + + sdfg = _get_sdfg(f"test_1d_strided_copy_s{stride}", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness - only elements at stride intervals should be copied + expected = cp.zeros_like(A) + expected[::stride] = A[::stride] + cp.testing.assert_array_equal(B[::stride], expected[::stride]) + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2", [(2, 1), (4, 1), (1, 2), (1, 4)]) +def test_2d_strided_copy(stride_1, stride_2): + """Test 2D strided copy. First dimension is unit stride, second is strided.""" + import cupy as cp + + dimension = (8, 4) + copy_strides = (stride_1, stride_2) + + sdfg = _get_sdfg(f"test_2d_strided_copy_s{stride_1}_{stride_2}", dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = cp.zeros_like(A) + expected[::stride_1, ::stride_2] = A[::stride_1, ::stride_2] + cp.testing.assert_array_equal(B[::stride_1, ::stride_2], expected[::stride_1, ::stride_2]) + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2,stride_3", [(1, 2, 2), (1, 2, 4), (1, 4, 2), (4, 1, 1), (4, 2, 1), + (2, 2, 1)]) +def test_3d_strided_copy(stride_1, stride_2, stride_3): + """Test 3D strided copy. First dimension is unit stride, others are strided.""" + import cupy as cp + + dimension = (8, 4, 4) + copy_strides = (stride_1, stride_2, stride_3) + + sdfg = _get_sdfg(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}", dimension, copy_strides) + sdfg.save("x1.sdfg") + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("x2.sdfg") + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + expected = cp.zeros_like(A) + expected[::stride_1, ::stride_2, ::stride_3] = A[::stride_1, ::stride_2, ::stride_3] + cp.testing.assert_array_equal(B, expected) + + +@pytest.mark.gpu +@pytest.mark.parametrize("stride_1,stride_2,stride_3", [ + (1, 2, 2), + (1, 2, 4), + (1, 4, 2), + (2, 2, 1), +]) +def test_3d_strided_copy_w_other_subset(stride_1, stride_2, stride_3): + """Test 3D strided copy. 
First dimension is unit stride, others are strided.""" + import cupy as cp + + dimension = (8, 8, 8) + copy_strides = (stride_1, stride_2, stride_3) + + sdfg = _get_sdfg_with_other_subset(f"test_3d_strided_copy_s{stride_1}_{stride_2}_{stride_3}_w_other_subset", + dimension, copy_strides) + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + + # Count tasklets + num_tasklets = _count_tasklets(sdfg) + assert num_tasklets == 1 + + # Test with cupy + A = cp.random.rand(*dimension).astype(np.float32) + B = cp.zeros_like(A) + + sdfg(A=A, B=B) + + # Verify correctness + cp.testing.assert_array_equal(B[0:4:copy_strides[0], 0:4:copy_strides[1], 0:4:copy_strides[2]], + A[4:8:copy_strides[0], 4:8:copy_strides[1], 4:8:copy_strides[2]]) + + +@pytest.mark.gpu +def test_independent_copies(): + + @dace.program + def independent_copies(A: dace.uint32[128], B: dace.uint32[128], C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = independent_copies.to_sdfg() + sdfg.apply_gpu_transformations() + sdfg.validate() + sdfg.save("s1.sdfg") + + InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {}) + sdfg.save("s2.sdfg") + + sdfg.validate() + sdfg.compile()
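
For readers trying the pass outside the test suite, the following minimal sketch (mirroring the tests above; the program, array names, and sizes are illustrative) shows how the pass is applied after the standard GPU transformations:

```python
import dace
from dace.transformation.passes.gpu_specialization.insert_explicit_gpu_global_memory_copies import (
    InsertExplicitGPUGlobalMemoryCopies, )


@dace.program
def scale(A: dace.float32[256], B: dace.float32[256]):
    for i in dace.map[0:256]:
        B[i] = 2 * A[i]


sdfg = scale.to_sdfg()
sdfg.apply_gpu_transformations()  # introduce GPU_Global arrays and GPU_Device maps
InsertExplicitGPUGlobalMemoryCopies().apply_pass(sdfg, {})  # make host<->device copies explicit
sdfg.validate()
```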