Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions ext/StridedGPUArraysExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module StridedGPUArraysExt

using Strided, GPUArrays
using GPUArrays: Adapt, KernelAbstractions
using GPUArrays.KernelAbstractions: @kernel, @index

ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}

Expand All @@ -19,4 +20,27 @@ function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS
return dst
end

# lifted from GPUArrays.jl
# Fill a GPU-backed StridedView `A` with the value `x` by launching a simple
# element-wise kernel through KernelAbstractions.
#
# Returns `A` (mutated), matching the `Base.fill!` contract.
function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS}
    # Empty views have nothing to write; bail out before launching a kernel
    # with a zero-sized ndrange.
    isempty(A) && return A
    # Naive one-work-item-per-element kernel using linear indexing.
    # NOTE(review): linear indexing may be suboptimal for strided access on
    # GPU; an @index(Global, Cartesian) variant is worth benchmarking.
    @kernel function fill_kernel!(a, val)
        idx = @index(Global, Linear)
        @inbounds a[idx] = val
    end
    kernel = fill_kernel!(KernelAbstractions.get_backend(A))
    # For the conjugating wrappers (conj, adjoint) the fill value is
    # pre-conjugated before being written through the view.
    # NOTE(review): this relies on how the device-adapted view applies `F`
    # on writes — confirm against StridedViews' setindex! semantics.
    f_x = F <: Union{typeof(conj), typeof(adjoint)} ? conj(x) : x
    # `length(A)` is 1 for a 0-dimensional view, so 0-D arrays are covered
    # by the same launch configuration.
    kernel(A, f_x; ndrange = length(A))
    return A
end

# Matrix-matrix multiplication hook for 2-D GPU-backed StridedViews:
# computes C = α*A*B + β*C by delegating to GPUArrays' generic fallback
# kernel instead of Strided's own multiplication machinery, so the work
# happens on the device. Returns `C` (mutated).
function Strided.__mul!(
        C::StridedView{TC, 2, <:AnyGPUArray{TC}},
        A::StridedView{TA, 2, <:AnyGPUArray{TA}},
        B::StridedView{TB, 2, <:AnyGPUArray{TB}},
        α::Number, β::Number
    ) where {TC, TA, TB}
    return GPUArrays.generic_matmatmul!(C, A, B, α, β)
end

end
3 changes: 1 addition & 2 deletions ext/StridedJLArraysExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@ module StridedJLArraysExt

using Strided, StridedViews, JLArrays
using JLArrays: Adapt
using JLArrays: GPUArrays

const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}

# Copy `src` into `dst` for JLArray-backed StridedViews (any combination of
# identity/conj/transpose/adjoint wrappers), lowering to GPUArrays'
# broadcast-based copy kernel. Returns `dst` (mutated).
function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TAD <: JLArray{TD}, FD <: ALL_FS, TS <: Number, NS, TAS <: JLArray{TS}, FS <: ALL_FS}
    # Wrap `src` in a Broadcasted with the parent array type's broadcast
    # style so GPUArrays' copy machinery treats it as an element-wise
    # identity broadcast over the axes of `dst`.
    bc_style = Base.Broadcast.BroadcastStyle(TAS)
    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
    # Single, fully qualified call: reaching GPUArrays through JLArrays
    # avoids a direct GPUArrays dependency in this extension.
    # NOTE(review): `_copyto!` is a GPUArrays internal — version-fragile.
    JLArrays.GPUArrays._copyto!(dst, bc)
    return dst
end

Expand Down
2 changes: 2 additions & 0 deletions test/amd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
axes(f1(A1)) == axes(f2(A2)) || continue
@test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == AMDGPU.Adapt.adapt(Vector{T}, copy!(B2, B1))
@test copy!(zA1, f1(A1)) == copy!(zA2, B1)
x = rand(T)
@test f1(StridedView(AMDGPU.Adapt.adapt(Vector{T}, fill!(A1c, x)))) == AMDGPU.Adapt.adapt(Vector{T}, fill!(B1, x))
end
end
end
2 changes: 2 additions & 0 deletions test/cuda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
axes(f1(A1)) == axes(f2(A2)) || continue
@test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == CUDA.Adapt.adapt(Vector{T}, copy!(B2, B1))
@test copy!(zA1, f1(A1)) == copy!(zA2, B1)
x = rand(T)
@test f1(StridedView(CUDA.Adapt.adapt(Vector{T}, fill!(A1c, x)))) == CUDA.Adapt.adapt(Vector{T}, fill!(B1, x))
end
end
end
4 changes: 3 additions & 1 deletion test/jlarrays.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
@testset for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
@testset "Copy with JLArrayStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
for m1 in (0, 16, 32), m2 in (0, 16, 32)
A1 = JLArray(randn(T, (m1, m2)))
Expand All @@ -12,6 +12,8 @@ for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
axes(f1(A1)) == axes(f2(A2)) || continue
@test collect(Matrix(copy!(f2(A2), f1(A1)))) == JLArrays.Adapt.adapt(Vector{T}, copy!(B2, B1))
@test copy!(zA1, f1(A1)) == copy!(zA2, B1)
x = rand(T)
@test f1(StridedView(JLArrays.Adapt.adapt(Vector{T}, fill!(A1c, x)))) == JLArrays.Adapt.adapt(Vector{T}, fill!(B1, x))
end
end
end
2 changes: 1 addition & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Random.seed!(1234)
is_buildkite = get(ENV, "BUILDKITE", "false") == "true"

if !is_buildkite
include("jlarrays.jl")
println("Base.Threads.nthreads() = $(Base.Threads.nthreads())")

println("Running tests single-threaded:")
Expand All @@ -28,7 +29,6 @@ if !is_buildkite
include("blasmultests.jl")
Strided.disable_threaded_mul()

include("jlarrays.jl")
Aqua.test_all(Strided; piracies = false)
end

Expand Down
Loading