From 8702594a40e6ad40445bbcbeffc182824605fe4a Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 18 Mar 2026 10:27:42 -0400 Subject: [PATCH 1/4] Slow but working copyto --- ext/StridedGPUArraysExt.jl | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 608d8b5..7374000 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -1,6 +1,6 @@ module StridedGPUArraysExt -using Strided, GPUArrays +using Strided, GPUArrays, LinearAlgebra using GPUArrays: Adapt, KernelAbstractions using GPUArrays.KernelAbstractions: @kernel, @index @@ -34,7 +34,29 @@ function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractG return A end -function Strided.__mul!( +# kernel-based variant for copying between wrapped GPU arrays +@kernel function linear_copy_kernel!(dest, dstart, src, sstart, n) + i = @index(Global, Linear) + if i <= n + @inbounds dest[dstart+i-1] = src[sstart+i-1] + end +end + +function Base.copyto!(dest::StridedView{TD, ND, TAD, FD}, dstart::Integer, + src::StridedView{TS, NS, TAS, FS}, sstart::Integer, n::Integer) where {TD, TS, ND, NS, TAD <: AbstractGPUArray{TD}, TAS <: AbstractGPUArray{TS}, FD, FS} + n == 0 && return dest + n < 0 && throw(ArgumentError(string("tried to copy n=", n, " elements, but n should be nonnegative"))) + destinds, srcinds = LinearIndices(dest), LinearIndices(src) + (checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart+n-1)) || throw(BoundsError(dest, dstart:dstart+n-1)) + (checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart+n-1)) || throw(BoundsError(src, sstart:sstart+n-1)) + kernel = linear_copy_kernel!(KernelAbstractions.get_backend(dest)) + kernel(dest, dstart, src, sstart, n; ndrange=n) + return dest +end +Base.copyto!(dest::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, TS, ND, NS, TAD <: AbstractGPUArray{TD}, TAS <: AbstractGPUArray{TS}, FD, FS} = copyto!(dest, 1, src, 1, length(src)) + + +function LinearAlgebra.mul!( C::StridedView{TC, 2, <:AnyGPUArray{TC}}, A::StridedView{TA, 2, <:AnyGPUArray{TA}}, B::StridedView{TB, 2, <:AnyGPUArray{TB}}, From aa0c57b75ceb3f134868828ecd7d0b5c08148f78 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 18 Mar 2026 10:40:06 -0400 Subject: [PATCH 2/4] Formatter --- ext/StridedGPUArraysExt.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 7374000..992c8c0 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -38,19 +38,21 @@ end @kernel function linear_copy_kernel!(dest, dstart, src, sstart, n) i = @index(Global, Linear) if i <= n - @inbounds dest[dstart+i-1] = src[sstart+i-1] + @inbounds dest[dstart + i - 1] = src[sstart + i - 1] end end -function Base.copyto!(dest::StridedView{TD, ND, TAD, FD}, dstart::Integer, - src::StridedView{TS, NS, TAS, FS}, sstart::Integer, n::Integer) where {TD, TS, ND, NS, TAD <: AbstractGPUArray{TD}, TAS <: AbstractGPUArray{TS}, FD, FS} +function Base.copyto!( + dest::StridedView{TD, ND, TAD, FD}, dstart::Integer, + src::StridedView{TS, NS, TAS, FS}, sstart::Integer, n::Integer + ) where {TD, TS, ND, NS, TAD <: AbstractGPUArray{TD}, TAS <: AbstractGPUArray{TS}, FD, FS} n == 0 && return dest n < 0 && throw(ArgumentError(string("tried to copy n=", n, " elements, but n should be nonnegative"))) destinds, srcinds = LinearIndices(dest), LinearIndices(src) - (checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart+n-1)) || throw(BoundsError(dest, dstart:dstart+n-1)) - (checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart+n-1)) || throw(BoundsError(src, sstart:sstart+n-1)) + (checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart + n - 1)) || throw(BoundsError(dest, dstart:(dstart + n - 1))) + (checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart + n - 1)) || throw(BoundsError(src, sstart:(sstart + n - 1))) kernel = linear_copy_kernel!(KernelAbstractions.get_backend(dest)) - kernel(dest, dstart, src, sstart, n; ndrange=n) + kernel(dest, dstart, src, sstart, n; ndrange = n) return dest end Base.copyto!(dest::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, TS, ND, NS, TAD <: AbstractGPUArray{TD}, TAS <: AbstractGPUArray{TS}, FD, FS} = copyto!(dest, 1, src, 1, length(src)) From f5372e8254e9ca49261d1213d5c956e32c969ee2 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 18 Mar 2026 13:42:50 -0400 Subject: [PATCH 3/4] Another copyto --- ext/StridedGPUArraysExt.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 992c8c0..4eb7d1f 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -20,6 +20,14 @@ function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS return dst end +function Base.copyto!(dest::StridedView{T, N, <:AnyGPUArray{T}}, bc::Base.Broadcast.Broadcasted{Strided.StridedArrayStyle{N}}) where {T <: Number, N} + dims = size(dest) + any(isequal(0), dims) && return dest + + GPUArrays._copyto!(dest, bc) + return dest +end + # lifted from GPUArrays.jl function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS} isempty(A) && return A From ab912b48c3c452ba39fe67a25ba5ba9bb3c1bd18 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 19 Mar 2026 07:20:15 -0400 Subject: [PATCH 4/4] Remove unneeded copyto --- ext/StridedGPUArraysExt.jl | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 4eb7d1f..5443e74 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -42,30 +42,6 @@ function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractG return A end -# kernel-based variant for copying between wrapped GPU arrays -@kernel function linear_copy_kernel!(dest, dstart, src, sstart, n) - i = @index(Global, Linear) - if i <= n - @inbounds dest[dstart + i - 1] = src[sstart + i - 1] - end -end - -function Base.copyto!( - dest::StridedView{TD, ND, TAD, FD}, dstart::Integer, - src::StridedView{TS, NS, TAS, FS}, sstart::Integer, n::Integer - ) where {TD, TS, ND, NS, TAD <: AbstractGPUArray{TD}, TAS <: AbstractGPUArray{TS}, FD, FS} - n == 0 && return dest - n < 0 && throw(ArgumentError(string("tried to copy n=", n, " elements, but n should be nonnegative"))) - destinds, srcinds = LinearIndices(dest), LinearIndices(src) - (checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart + n - 1)) || throw(BoundsError(dest, dstart:(dstart + n - 1))) - (checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart + n - 1)) || throw(BoundsError(src, sstart:(sstart + n - 1))) - kernel = linear_copy_kernel!(KernelAbstractions.get_backend(dest)) - kernel(dest, dstart, src, sstart, n; ndrange = n) - return dest -end -Base.copyto!(dest::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, TS, ND, NS, TAD <: AbstractGPUArray{TD}, TAS <: AbstractGPUArray{TS}, FD, FS} = copyto!(dest, 1, src, 1, length(src)) - - function LinearAlgebra.mul!( C::StridedView{TC, 2, <:AnyGPUArray{TC}}, A::StridedView{TA, 2, <:AnyGPUArray{TA}},