From e9cd9c168c0c3ef7037fbd5eba11f1dbb84efc8f Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sun, 15 Mar 2026 08:48:42 -0400
Subject: [PATCH 01/17] Add GPU mapreduce support via KernelAbstractions kernel

Override _mapreduce_fuse! for GPU-backed StridedViews to dispatch
to a KernelAbstractions kernel instead of the CPU-specific threaded/SIMD
path. One GPU thread per output element with a sequential inner loop
over reduction dimensions. Handles pure map (op=nothing), reductions,
initop, and conj/adjoint views via ParentIndex semantics.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ext/StridedGPUArraysExt.jl | 86 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 608d8b5..7c14ae7 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -3,6 +3,7 @@ module StridedGPUArraysExt
 using Strided, GPUArrays
 using GPUArrays: Adapt, KernelAbstractions
 using GPUArrays.KernelAbstractions: @kernel, @index
+using StridedViews: ParentIndex
 
 ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
@@ -43,4 +44,89 @@ function Strided.__mul!(
     return GPUArrays.generic_matmatmul!(C, A, B, α, β)
 end
 
+# ---------- GPU mapreduce support ----------
+
+@inline _gpu_init_acc(::Nothing, current_val) = current_val
+@inline _gpu_init_acc(initop, current_val) = initop(current_val)
+
+@inline _gpu_accum(::Nothing, acc, val) = val
+@inline _gpu_accum(op, acc, val) = op(acc, val)
+
+@inline function _strides_dot(strides::NTuple{N, Int}, cidx::CartesianIndex{N}) where {N}
+    s = 0
+    for d in Base.OneTo(N)
+        @inbounds s += strides[d] * (cidx[d] - 1)
+    end
+    return s
+end
+
+@kernel function _mapreduce_gpu_kernel!(
+        f, op, initop,
+        dims::NTuple{N, Int},
+        out::OT,
+        inputs::IT
+    ) where {N, OT <: StridedView, IT <: Tuple}
+
+    out_linear = @index(Global, Linear)
+
+    # Non-reduction subspace sizes (1 for reduction dims)
+    nred_sizes = ntuple(Val(N)) do d
+        @inbounds iszero(out.strides[d]) ? 1 : dims[d]
+    end
+    # Reduction subspace sizes (1 for non-reduction dims)
+    red_sizes = ntuple(Val(N)) do d
+        @inbounds iszero(out.strides[d]) ? dims[d] : 1
+    end
+
+    # Map out_linear → cartesian in non-reduction subspace
+    nred_cidx = CartesianIndices(nred_sizes)[out_linear]
+    out_parent = out.offset + 1 + _strides_dot(out.strides, nred_cidx)
+
+    # Initialize accumulator from current output value (or apply initop)
+    @inbounds acc = _gpu_init_acc(initop, out[ParentIndex(out_parent)])
+
+    # Sequential reduction loop over reduction subspace
+    @inbounds for red_linear in Base.OneTo(prod(red_sizes))
+        red_cidx = CartesianIndices(red_sizes)[red_linear]
+        complete_cidx = CartesianIndex(ntuple(Val(N)) do d
+            @inbounds nred_cidx[d] + red_cidx[d] - 1
+        end)
+
+        val = f(ntuple(Val(length(inputs))) do m
+            @inbounds begin
+                a = inputs[m]
+                ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx)
+                a[ParentIndex(ip)]
+            end
+        end...)
+
+        acc = _gpu_accum(op, acc, val)
+    end
+
+    @inbounds out[ParentIndex(out_parent)] = acc
+end
+
+function Strided._mapreduce_fuse!(
+        f, op, initop,
+        dims::Dims{N},
+        arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView}}
+    ) where {TO, N}
+
+    out = arrays[1]
+    inputs_raw = Base.tail(arrays)
+    M = length(inputs_raw)
+    inputs = ntuple(i -> inputs_raw[i], Val(M))
+
+    # Number of output elements = product of non-reduction dims
+    out_total = prod(ntuple(Val(N)) do d
+        @inbounds iszero(out.strides[d]) ? 1 : dims[d]
+    end)
+
+    backend = KernelAbstractions.get_backend(parent(out))
+    kernel! = _mapreduce_gpu_kernel!(backend)
+    kernel!(f, op, initop, dims, out, inputs; ndrange = out_total)
+
+    return nothing
+end
+
 end

From 96b7d09f2cd8e121f4796df0af62e336cad542c3 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sun, 15 Mar 2026 08:49:02 -0400
Subject: [PATCH 02/17] Add GPU mapreduce tests using JLArrays

Tests cover: pure map!, reduction over dim 1, reduction over dim 2,
conj/adjoint StridedView, and full scalar reduction. JLArrays provides
a CPU-backed GPU simulator so tests run without real GPU hardware.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 test/mapreduce_gpu.jl | 45 +++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl      |  3 +++
 2 files changed, 48 insertions(+)
 create mode 100644 test/mapreduce_gpu.jl

diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl
new file mode 100644
index 0000000..3845d25
--- /dev/null
+++ b/test/mapreduce_gpu.jl
@@ -0,0 +1,45 @@
+using Test, Strided, StridedViews, JLArrays
+
+@testset "GPU map! via StridedView" begin
+    for T in (Float32, Float64, ComplexF32, ComplexF64)
+        A = JLArray(rand(T, 8, 6))
+        B = similar(A)
+        map!(x -> 2x, StridedView(B), StridedView(A))
+        @test Array(B) ≈ 2 .* Array(A)
+    end
+end
+
+@testset "GPU mapreducedim! — sum over dim 1" begin
+    for T in (Float32, Float64)
+        A = JLArray(rand(T, 8, 6))
+        B = JLArray(zeros(T, 1, 6))
+        sum!(StridedView(B), StridedView(A))
+        @test Array(B) ≈ sum(Array(A); dims = 1)
+    end
+end
+
+@testset "GPU mapreducedim! — sum over dim 2" begin
+    for T in (Float32, Float64)
+        A = JLArray(rand(T, 8, 6))
+        B = JLArray(zeros(T, 8, 1))
+        sum!(StridedView(B), StridedView(A))
+        @test Array(B) ≈ sum(Array(A); dims = 2)
+    end
+end
+
+@testset "GPU map! with conj/adjoint StridedView" begin
+    for T in (ComplexF32, ComplexF64)
+        A = JLArray(rand(T, 4, 4))
+        B = JLArray(zeros(T, 4, 4))
+        copy!(adjoint(StridedView(B)), StridedView(A))
+        @test Array(B) ≈ conj(Array(A))
+    end
+end
+
+@testset "GPU mapreduce — full scalar reduction" begin
+    for T in (Float32, Float64)
+        A = JLArray(rand(T, 8, 6))
+        result = sum(StridedView(A))
+        @test result ≈ sum(Array(A))
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4108876..a26b337 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,6 +12,9 @@ is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
 
 if !is_buildkite
     include("jlarrays.jl")
+    @testset "JLArray GPU mapreduce" begin
+        include("mapreduce_gpu.jl")
+    end
     println("Base.Threads.nthreads() =  $(Base.Threads.nthreads())")
 
     println("Running tests single-threaded:")

From 5e6900298e17f53e3d263aeffd345e8a8d2587e7 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sun, 15 Mar 2026 09:04:38 -0400
Subject: [PATCH 03/17] Fix GPU mapreduce: scalar indexing, output type, test
 expectations

Three fixes:
- Add _mapreduce GPU override to avoid scalar indexing (first(A),
  out[ParentIndex(1)]) which JLArrays/real GPUs prohibit; uses zero(T)
  as proxy for type inference and similar(parent(A),...) to ensure the
  output stays on the GPU device
- Fix adjoint test expectation: copy!(adjoint(B), A) gives B = adjoint(A),
  not conj(A)
- Use qualified names Strided._init_reduction! and Strided._mapreducedim!
  since they are not exported into the extension module

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ext/StridedGPUArraysExt.jl | 27 +++++++++++++++++++++++++++
 test/mapreduce_gpu.jl      |  3 ++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 7c14ae7..5ae0644 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -106,6 +106,33 @@ end
     @inbounds out[ParentIndex(out_parent)] = acc
 end
 
+# GPU-compatible _mapreduce: avoids scalar indexing (first(A), out[ParentIndex(1)])
+# that JLArrays/real GPUs prohibit. Uses zero(T) as a proxy to infer the output
+# element type without reading from the device.
+function Strided._mapreduce(
+        f, op, A::StridedView{T, N, <:AnyGPUArray{T}}, nt = nothing
+    ) where {T, N}
+    if length(A) == 0
+        b = Base.mapreduce_empty(f, op, T)
+        return nt === nothing ? b : op(b, nt.init)
+    end
+
+    dims = size(A)
+
+    if nt === nothing
+        a_zero = Base.mapreduce_first(f, op, zero(T))
+        out = similar(parent(A), typeof(a_zero), (1,))
+        Strided._init_reduction!(out, f, op, a_zero)
+    else
+        out = similar(parent(A), typeof(nt.init), (1,))
+        fill!(out, nt.init)
+    end
+
+    Strided._mapreducedim!(f, op, nothing, dims, (sreshape(StridedView(out), one.(dims)), A))
+
+    return Array(out)[1]
+end
+
 function Strided._mapreduce_fuse!(
         f, op, initop,
         dims::Dims{N},
diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl
index 3845d25..c959c4b 100644
--- a/test/mapreduce_gpu.jl
+++ b/test/mapreduce_gpu.jl
@@ -32,7 +32,7 @@ end
         A = JLArray(rand(T, 4, 4))
         B = JLArray(zeros(T, 4, 4))
         copy!(adjoint(StridedView(B)), StridedView(A))
-        @test Array(B) ≈ conj(Array(A))
+        @test Array(B) ≈ adjoint(Array(A))
     end
 end
 
@@ -40,6 +40,7 @@ end
     for T in (Float32, Float64)
         A = JLArray(rand(T, 8, 6))
         result = sum(StridedView(A))
+        @test result isa T
         @test result ≈ sum(Array(A))
     end
 end

From b21ea74a02772ee28b3510249ebf4bbdd20da445 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sun, 15 Mar 2026 09:17:32 -0400
Subject: [PATCH 04/17] Use GPUArrays.neutral_element for _mapreduce init value

Replace zero(T) proxy with the same pattern GPUArrays uses:
infer the output element type via Broadcast.combine_eltypes +
Base.promote_op, then call GPUArrays.neutral_element(op, ET).
Unknown operators now produce a clear error message rather than
silently using zero(T). Also removes the dependency on the
unexported Strided._init_reduction!.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ext/StridedGPUArraysExt.jl | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 5ae0644..2023afd 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -107,8 +107,9 @@ end
 end
 
 # GPU-compatible _mapreduce: avoids scalar indexing (first(A), out[ParentIndex(1)])
-# that JLArrays/real GPUs prohibit. Uses zero(T) as a proxy to infer the output
-# element type without reading from the device.
+# that JLArrays/real GPUs prohibit. Mirrors GPUArrays' neutral_element approach:
+# infer output type via Broadcast machinery, look up the neutral element (errors on
+# unknown ops), fill the output buffer, then read back a single scalar via Array().
 function Strided._mapreduce(
         f, op, A::StridedView{T, N, <:AnyGPUArray{T}}, nt = nothing
     ) where {T, N}
@@ -120,14 +121,19 @@ function Strided._mapreduce(
     dims = size(A)
 
     if nt === nothing
-        a_zero = Base.mapreduce_first(f, op, zero(T))
-        out = similar(parent(A), typeof(a_zero), (1,))
-        Strided._init_reduction!(out, f, op, a_zero)
+        ET = Base.Broadcast.combine_eltypes(f, (A,))
+        ET = Base.promote_op(op, ET, ET)
+        (ET === Union{} || ET === Any) &&
+            error("cannot infer output element type for mapreduce; pass an explicit `init`")
+        init = GPUArrays.neutral_element(op, ET)
     else
-        out = similar(parent(A), typeof(nt.init), (1,))
-        fill!(out, nt.init)
+        ET = typeof(nt.init)
+        init = nt.init
     end
 
+    out = similar(parent(A), ET, (1,))
+    fill!(out, init)
+
     Strided._mapreducedim!(f, op, nothing, dims, (sreshape(StridedView(out), one.(dims)), A))
 
     return Array(out)[1]

From 7b30822fd3df4e82986848e363b2cf0810af429b Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sun, 15 Mar 2026 09:57:57 -0400
Subject: [PATCH 05/17] Extend GPU mapreduce tests with nontrivial strides and
 offsets

Add 7 new testsets covering:
- map! reading from stride-2 input (every other row)
- map! writing into stride-2 output, checking untouched rows stay zero
- map! on a subview with nonzero offset (2:6, 3:6 slice)
- map! with permuted (transposed) strides via permutedims
- sum over dim 1 with stride-2 input
- sum over dim 2 with offset subview
- full scalar reduction on stride-2 and offset subviews

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 test/mapreduce_gpu.jl | 73 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl
index c959c4b..99b1f90 100644
--- a/test/mapreduce_gpu.jl
+++ b/test/mapreduce_gpu.jl
@@ -44,3 +44,76 @@ end
         @test result ≈ sum(Array(A))
     end
 end
+
+# ---- nontrivial strides and offsets ----
+
+@testset "GPU map! — stride-2 input (every other row)" begin
+    for T in (Float32, Float64, ComplexF32, ComplexF64)
+        A = JLArray(rand(T, 8, 6))
+        B = JLArray(zeros(T, 4, 6))
+        src = StridedView(A)[1:2:8, :]   # stride 2 in dim 1
+        map!(x -> 2x, StridedView(B), src)
+        @test Array(B) ≈ 2 .* Array(A)[1:2:8, :]
+    end
+end
+
+@testset "GPU map! — stride-2 output (every other row)" begin
+    for T in (Float32, Float64, ComplexF32, ComplexF64)
+        A = JLArray(rand(T, 4, 6))
+        B = JLArray(zeros(T, 8, 6))
+        dst = StridedView(B)[1:2:8, :]   # stride 2 in dim 1
+        map!(identity, dst, StridedView(A))
+        @test Array(B)[1:2:8, :] ≈ Array(A)
+        @test all(iszero, Array(B)[2:2:8, :])  # untouched rows stay zero
+    end
+end
+
+@testset "GPU map! — subview with nonzero offset" begin
+    for T in (Float32, Float64, ComplexF32, ComplexF64)
+        A = JLArray(rand(T, 8, 6))
+        B = JLArray(zeros(T, 5, 4))
+        src = StridedView(A)[2:6, 3:6]   # offset = 1 row + 2 cols
+        map!(x -> x + 1, StridedView(B), src)
+        @test Array(B) ≈ Array(A)[2:6, 3:6] .+ 1
+    end
+end
+
+@testset "GPU map! — permuted (transposed) strides" begin
+    for T in (Float32, Float64, ComplexF32, ComplexF64)
+        A = JLArray(rand(T, 6, 8))
+        B = JLArray(zeros(T, 8, 6))
+        src = permutedims(StridedView(A), (2, 1))   # strides (8,1) → (1,6) after permute: 8×6 view
+        map!(identity, StridedView(B), src)
+        @test Array(B) ≈ permutedims(Array(A), (2, 1))
+    end
+end
+
+@testset "GPU sum over dim 1 — stride-2 input" begin
+    for T in (Float32, Float64)
+        A = JLArray(rand(T, 8, 6))
+        B = JLArray(zeros(T, 1, 6))
+        src = StridedView(A)[1:2:8, :]   # 4×6 with stride 2
+        sum!(StridedView(B), src)
+        @test Array(B) ≈ sum(Array(A)[1:2:8, :]; dims = 1)
+    end
+end
+
+@testset "GPU sum over dim 2 — subview with offset" begin
+    for T in (Float32, Float64)
+        A = JLArray(rand(T, 8, 6))
+        B = JLArray(zeros(T, 5, 1))
+        src = StridedView(A)[2:6, 2:5]   # 5×4, offset = 1 row + 1 col
+        sum!(StridedView(B), src)
+        @test Array(B) ≈ sum(Array(A)[2:6, 2:5]; dims = 2)
+    end
+end
+
+@testset "GPU full scalar reduction — stride-2 and offset subview" begin
+    for T in (Float32, Float64)
+        A = JLArray(rand(T, 8, 6))
+        r1 = sum(StridedView(A)[1:2:8, :])    # stride-2
+        @test r1 ≈ sum(Array(A)[1:2:8, :])
+        r2 = sum(StridedView(A)[3:6, 2:5])    # offset subview
+        @test r2 ≈ sum(Array(A)[3:6, 2:5])
+    end
+end

From 4f66facf5bf60bd20ae0f8d5d2ae0a690ae8486b Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sun, 15 Mar 2026 10:21:31 -0400
Subject: [PATCH 06/17] Restrict GPU _mapreduce_fuse! dispatch to all-GPU input
 arrays

Change Vararg{StridedView} to Vararg{StridedView{<:Any, N, <:AnyGPUArray}}
so the GPU kernel is only dispatched when every input (not just the output)
is GPU-backed. Mixed CPU/GPU calls fall through to the CPU path.

Add a test for mixed inputs: the call now falls through to the CPU
fallback, which throws on the GPU-backed output (JLArrays' scalar-indexing
guard), indicating that the GPU kernel was not dispatched.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ext/StridedGPUArraysExt.jl |  2 +-
 test/mapreduce_gpu.jl      | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 2023afd..63f6dd2 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -142,7 +142,7 @@ end
 function Strided._mapreduce_fuse!(
         f, op, initop,
         dims::Dims{N},
-        arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView}}
+        arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView{<:Any, N, <:AnyGPUArray}}}
     ) where {TO, N}
 
     out = arrays[1]
diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl
index 99b1f90..aae6471 100644
--- a/test/mapreduce_gpu.jl
+++ b/test/mapreduce_gpu.jl
@@ -47,6 +47,16 @@ end
 
 # ---- nontrivial strides and offsets ----
 
+@testset "GPU dispatch requires all inputs on GPU" begin
+    # With a CPU input the GPU _mapreduce_fuse! must not be dispatched.
+    # The CPU fallback fires instead; since the output is GPU-backed it hits
+    # JLArrays' scalar-indexing guard — confirming the GPU path was bypassed.
+    A_gpu = JLArray(rand(Float32, 4, 4))
+    A_cpu = Array(A_gpu)
+    B_gpu = JLArray(zeros(Float32, 4, 4))
+    @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu))
+end
+
 @testset "GPU map! — stride-2 input (every other row)" begin
     for T in (Float32, Float64, ComplexF32, ComplexF64)
         A = JLArray(rand(T, 8, 6))

From 32b66fb349e73ae4551d1f07e2a05d8ac3922fed Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sun, 15 Mar 2026 10:54:27 -0400
Subject: [PATCH 07/17] Introduce GPUStridedView type alias to reduce verbosity

Define const GPUStridedView{T,N} = StridedView{T, N, <:AnyGPUArray{T}}
and use it throughout the extension in place of the long-form
StridedView{T, N, <:AnyGPUArray{T}} / StridedView{<:Any, N, <:AnyGPUArray}
annotations on get_backend, BroadcastStyle, __mul!, _mapreduce, and
_mapreduce_fuse!.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ext/StridedGPUArraysExt.jl | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 63f6dd2..a3f5a51 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -7,10 +7,13 @@ using StridedViews: ParentIndex
 
 ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv))
+# StridedView backed by any GPU array type, with element type linked to the parent.
+const GPUStridedView{T, N} = StridedView{T, N, <:AnyGPUArray{T}}
 
-function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}}
-    raw_style = Base.Broadcast.BroadcastStyle(TA)
+KernelAbstractions.get_backend(sv::GPUStridedView) = KernelAbstractions.get_backend(parent(sv))
+
+function Base.Broadcast.BroadcastStyle(gpu_sv::GPUStridedView{T, N}) where {T, N}
+    raw_style = Base.Broadcast.BroadcastStyle(typeof(parent(gpu_sv)))
     return typeof(raw_style)(Val(N)) # sets the dimensionality correctly
 end
 
@@ -36,9 +39,9 @@ function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractG
 end
 
 function Strided.__mul!(
-        C::StridedView{TC, 2, <:AnyGPUArray{TC}},
-        A::StridedView{TA, 2, <:AnyGPUArray{TA}},
-        B::StridedView{TB, 2, <:AnyGPUArray{TB}},
+        C::GPUStridedView{TC, 2},
+        A::GPUStridedView{TA, 2},
+        B::GPUStridedView{TB, 2},
         α::Number, β::Number
     ) where {TC, TA, TB}
     return GPUArrays.generic_matmatmul!(C, A, B, α, β)
@@ -111,7 +114,7 @@ end
 # infer output type via Broadcast machinery, look up the neutral element (errors on
 # unknown ops), fill the output buffer, then read back a single scalar via Array().
 function Strided._mapreduce(
-        f, op, A::StridedView{T, N, <:AnyGPUArray{T}}, nt = nothing
+        f, op, A::GPUStridedView{T, N}, nt = nothing
     ) where {T, N}
     if length(A) == 0
         b = Base.mapreduce_empty(f, op, T)
@@ -142,7 +145,7 @@ end
 function Strided._mapreduce_fuse!(
         f, op, initop,
         dims::Dims{N},
-        arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView{<:Any, N, <:AnyGPUArray}}}
+        arrays::Tuple{GPUStridedView{TO, N}, Vararg{GPUStridedView{<:Any, N}}}
     ) where {TO, N}
 
     out = arrays[1]

From eec30de653e3ad2d61e7f1cc7008ad9b77dbb96b Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 11:57:31 -0400
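Subject: [PATCH 08/17] Rework tests to run on both Array and JLArray backends

Replace test/mapreduce_gpu.jl with test/mapreduce_tests.jl, which runs the
same map!, sum! and sum cases against Array and JLArray, and parameterize
test/othertests.jl over an identical backend list.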

---
 test/mapreduce_gpu.jl   | 129 -----------------
 test/mapreduce_tests.jl | 145 +++++++++++++++++++
 test/othertests.jl      | 310 +++++++++++++++++++++++-----------------
 test/runtests.jl        |   4 +-
 4 files changed, 325 insertions(+), 263 deletions(-)
 delete mode 100644 test/mapreduce_gpu.jl
 create mode 100644 test/mapreduce_tests.jl

diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl
deleted file mode 100644
index aae6471..0000000
--- a/test/mapreduce_gpu.jl
+++ /dev/null
@@ -1,129 +0,0 @@
-using Test, Strided, StridedViews, JLArrays
-
-@testset "GPU map! via StridedView" begin
-    for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A = JLArray(rand(T, 8, 6))
-        B = similar(A)
-        map!(x -> 2x, StridedView(B), StridedView(A))
-        @test Array(B) ≈ 2 .* Array(A)
-    end
-end
-
-@testset "GPU mapreducedim! — sum over dim 1" begin
-    for T in (Float32, Float64)
-        A = JLArray(rand(T, 8, 6))
-        B = JLArray(zeros(T, 1, 6))
-        sum!(StridedView(B), StridedView(A))
-        @test Array(B) ≈ sum(Array(A); dims = 1)
-    end
-end
-
-@testset "GPU mapreducedim! — sum over dim 2" begin
-    for T in (Float32, Float64)
-        A = JLArray(rand(T, 8, 6))
-        B = JLArray(zeros(T, 8, 1))
-        sum!(StridedView(B), StridedView(A))
-        @test Array(B) ≈ sum(Array(A); dims = 2)
-    end
-end
-
-@testset "GPU map! with conj/adjoint StridedView" begin
-    for T in (ComplexF32, ComplexF64)
-        A = JLArray(rand(T, 4, 4))
-        B = JLArray(zeros(T, 4, 4))
-        copy!(adjoint(StridedView(B)), StridedView(A))
-        @test Array(B) ≈ adjoint(Array(A))
-    end
-end
-
-@testset "GPU mapreduce — full scalar reduction" begin
-    for T in (Float32, Float64)
-        A = JLArray(rand(T, 8, 6))
-        result = sum(StridedView(A))
-        @test result isa T
-        @test result ≈ sum(Array(A))
-    end
-end
-
-# ---- nontrivial strides and offsets ----
-
-@testset "GPU dispatch requires all inputs on GPU" begin
-    # With a CPU input the GPU _mapreduce_fuse! must not be dispatched.
-    # The CPU fallback fires instead; since the output is GPU-backed it hits
-    # JLArrays' scalar-indexing guard — confirming the GPU path was bypassed.
-    A_gpu = JLArray(rand(Float32, 4, 4))
-    A_cpu = Array(A_gpu)
-    B_gpu = JLArray(zeros(Float32, 4, 4))
-    @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu))
-end
-
-@testset "GPU map! — stride-2 input (every other row)" begin
-    for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A = JLArray(rand(T, 8, 6))
-        B = JLArray(zeros(T, 4, 6))
-        src = StridedView(A)[1:2:8, :]   # stride 2 in dim 1
-        map!(x -> 2x, StridedView(B), src)
-        @test Array(B) ≈ 2 .* Array(A)[1:2:8, :]
-    end
-end
-
-@testset "GPU map! — stride-2 output (every other row)" begin
-    for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A = JLArray(rand(T, 4, 6))
-        B = JLArray(zeros(T, 8, 6))
-        dst = StridedView(B)[1:2:8, :]   # stride 2 in dim 1
-        map!(identity, dst, StridedView(A))
-        @test Array(B)[1:2:8, :] ≈ Array(A)
-        @test all(iszero, Array(B)[2:2:8, :])  # untouched rows stay zero
-    end
-end
-
-@testset "GPU map! — subview with nonzero offset" begin
-    for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A = JLArray(rand(T, 8, 6))
-        B = JLArray(zeros(T, 5, 4))
-        src = StridedView(A)[2:6, 3:6]   # offset = 1 row + 2 cols
-        map!(x -> x + 1, StridedView(B), src)
-        @test Array(B) ≈ Array(A)[2:6, 3:6] .+ 1
-    end
-end
-
-@testset "GPU map! — permuted (transposed) strides" begin
-    for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A = JLArray(rand(T, 6, 8))
-        B = JLArray(zeros(T, 8, 6))
-        src = permutedims(StridedView(A), (2, 1))   # strides (8,1) → (1,6) after permute: 8×6 view
-        map!(identity, StridedView(B), src)
-        @test Array(B) ≈ permutedims(Array(A), (2, 1))
-    end
-end
-
-@testset "GPU sum over dim 1 — stride-2 input" begin
-    for T in (Float32, Float64)
-        A = JLArray(rand(T, 8, 6))
-        B = JLArray(zeros(T, 1, 6))
-        src = StridedView(A)[1:2:8, :]   # 4×6 with stride 2
-        sum!(StridedView(B), src)
-        @test Array(B) ≈ sum(Array(A)[1:2:8, :]; dims = 1)
-    end
-end
-
-@testset "GPU sum over dim 2 — subview with offset" begin
-    for T in (Float32, Float64)
-        A = JLArray(rand(T, 8, 6))
-        B = JLArray(zeros(T, 5, 1))
-        src = StridedView(A)[2:6, 2:5]   # 5×4, offset = 1 row + 1 col
-        sum!(StridedView(B), src)
-        @test Array(B) ≈ sum(Array(A)[2:6, 2:5]; dims = 2)
-    end
-end
-
-@testset "GPU full scalar reduction — stride-2 and offset subview" begin
-    for T in (Float32, Float64)
-        A = JLArray(rand(T, 8, 6))
-        r1 = sum(StridedView(A)[1:2:8, :])    # stride-2
-        @test r1 ≈ sum(Array(A)[1:2:8, :])
-        r2 = sum(StridedView(A)[3:6, 2:5])    # offset subview
-        @test r2 ≈ sum(Array(A)[3:6, 2:5])
-    end
-end
diff --git a/test/mapreduce_tests.jl b/test/mapreduce_tests.jl
new file mode 100644
index 0000000..46b3b34
--- /dev/null
+++ b/test/mapreduce_tests.jl
@@ -0,0 +1,145 @@
+# Parameterized mapreduce / map! tests.
+# Iterates over both Array and JLArray backends internally.
+
+backends = [("Array", identity), ("JLArray", JLArray)]
+
+for (backend_name, make_arr) in backends
+    @testset "$backend_name: map! via StridedView" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            A = make_arr(rand(T, 8, 6))
+            B = similar(A)
+            map!(x -> 2x, StridedView(B), StridedView(A))
+            @test Array(StridedView(B)) ≈ 2 .* Array(StridedView(A))
+        end
+    end
+
+    @testset "$backend_name: mapreducedim! — sum over dim 1" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 1, 6))
+            sum!(StridedView(B), StridedView(A))
+            @test Array(StridedView(B)) ≈ sum(data; dims = 1)
+        end
+    end
+
+    @testset "$backend_name: mapreducedim! — sum over dim 2" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 8, 1))
+            sum!(StridedView(B), StridedView(A))
+            @test Array(StridedView(B)) ≈ sum(data; dims = 2)
+        end
+    end
+
+    @testset "$backend_name: map! with conj/adjoint StridedView" begin
+        for T in (ComplexF32, ComplexF64)
+            data = rand(T, 4, 4)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 4, 4))
+            copy!(adjoint(StridedView(B)), StridedView(A))
+            @test Array(StridedView(B)) ≈ adjoint(data)
+        end
+    end
+
+    @testset "$backend_name: mapreduce — full scalar reduction" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            result = sum(StridedView(A))
+            @test result isa T
+            @test result ≈ sum(data)
+        end
+    end
+
+    # Only meaningful for GPU backends: mixing CPU and GPU inputs must not silently
+    # use the GPU dispatch path.
+    if make_arr !== identity
+        @testset "$backend_name: dispatch requires all inputs on GPU" begin
+            A_gpu = make_arr(rand(Float32, 4, 4))
+            A_cpu = Array(StridedView(A_gpu))
+            B_gpu = make_arr(zeros(Float32, 4, 4))
+            @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu))
+        end
+    end
+
+    @testset "$backend_name: map! — stride-2 input (every other row)" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 4, 6))
+            src = StridedView(A)[1:2:8, :]
+            map!(x -> 2x, StridedView(B), src)
+            @test Array(StridedView(B)) ≈ 2 .* data[1:2:8, :]
+        end
+    end
+
+    @testset "$backend_name: map! — stride-2 output (every other row)" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 4, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 8, 6))
+            dst = StridedView(B)[1:2:8, :]
+            map!(identity, dst, StridedView(A))
+            B_cpu = Array(StridedView(B))
+            @test B_cpu[1:2:8, :] ≈ data
+            @test all(iszero, B_cpu[2:2:8, :])
+        end
+    end
+
+    @testset "$backend_name: map! — subview with nonzero offset" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 5, 4))
+            src = StridedView(A)[2:6, 3:6]
+            map!(x -> x + 1, StridedView(B), src)
+            @test Array(StridedView(B)) ≈ data[2:6, 3:6] .+ 1
+        end
+    end
+
+    @testset "$backend_name: map! — permuted (transposed) strides" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 6, 8)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 8, 6))
+            src = permutedims(StridedView(A), (2, 1))
+            map!(identity, StridedView(B), src)
+            @test Array(StridedView(B)) ≈ permutedims(data, (2, 1))
+        end
+    end
+
+    @testset "$backend_name: sum over dim 1 — stride-2 input" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 1, 6))
+            src = StridedView(A)[1:2:8, :]
+            sum!(StridedView(B), src)
+            @test Array(StridedView(B)) ≈ sum(data[1:2:8, :]; dims = 1)
+        end
+    end
+
+    @testset "$backend_name: sum over dim 2 — subview with offset" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 5, 1))
+            src = StridedView(A)[2:6, 2:5]
+            sum!(StridedView(B), src)
+            @test Array(StridedView(B)) ≈ sum(data[2:6, 2:5]; dims = 2)
+        end
+    end
+
+    @testset "$backend_name: full scalar reduction — stride-2 and offset subview" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            r1 = sum(StridedView(A)[1:2:8, :])
+            @test r1 ≈ sum(data[1:2:8, :])
+            r2 = sum(StridedView(A)[3:6, 2:5])
+            @test r2 ≈ sum(data[3:6, 2:5])
+        end
+    end
+end
diff --git a/test/othertests.jl b/test/othertests.jl
index 8c775ce..223eb0f 100644
--- a/test/othertests.jl
+++ b/test/othertests.jl
@@ -1,157 +1,203 @@
+backends = [("Array", identity), ("JLArray", JLArray)]
+
 @testset "in-place matrix operations" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A1 = randn(T, (1000, 1000))
-        A2 = similar(A1)
-        A1c = copy(A1)
-        A2c = copy(A2)
-        B1 = StridedView(A1c)
-        B2 = StridedView(A2c)
-
-        @test conj!(A1) == conj!(B1)
-        @test adjoint!(A2, A1) == adjoint!(B2, B1)
-        @test transpose!(A2, A1) == transpose!(B2, B1)
-        @test permutedims!(A2, A1, (2, 1)) == permutedims!(B2, B1, (2, 1))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data1 = randn(T, (1000, 1000))
+            data2 = randn(T, (1000, 1000))
+            # CPU reference
+            A1 = copy(data1); A2 = copy(data2)
+            # Backend under test
+            B1 = StridedView(make_arr(copy(data1)))
+            B2 = StridedView(make_arr(copy(data2)))
+
+            conj!(A1);                        conj!(B1)
+            @test A1 ≈ Array(B1)
+            adjoint!(A2, A1);                 adjoint!(B2, B1)
+            @test A2 ≈ Array(B2)
+            transpose!(A2, A1);               transpose!(B2, B1)
+            @test A2 ≈ Array(B2)
+            permutedims!(A2, A1, (2, 1));     permutedims!(B2, B1, (2, 1))
+            @test A2 ≈ Array(B2)
+        end
     end
 end
 
 @testset "map, scale!, axpy! and axpby! with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        @testset for N in 2:6
-            dims = ntuple(n -> div(60, N), N)
-            R1, R2, R3 = rand(T, dims), rand(T, dims), rand(T, dims)
-            B1 = permutedims(StridedView(R1), randperm(N))
-            B2 = permutedims(StridedView(R2), randperm(N))
-            B3 = permutedims(StridedView(R3), randperm(N))
-            A1 = convert(Array, B1)
-            A2 = convert(Array{T}, B2) # test different converts
-            A3 = convert(Array{T, N}, B3)
-            C1 = deepcopy(B1)
-
-            @test rmul!(B1, 1 // 2) ≈ rmul!(A1, 1 // 2)
-            @test lmul!(1 // 3, B2) ≈ lmul!(1 // 3, A2)
-            @test axpy!(1 // 3, B1, B2) ≈ axpy!(1 // 3, A1, A2)
-            @test axpy!(1, B2, B3) ≈ axpy!(1, A2, A3)
-            @test axpby!(1 // 3, B1, 1 // 2, B3) ≈ axpby!(1 // 3, A1, 1 // 2, A3)
-            @test axpby!(1, B2, 1, B1) ≈ axpby!(1, A2, 1, A1)
-            @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) ≈
-                map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3)
-            @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView
-            @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array
-            @test mul!(B1, 1, B2) ≈ mul!(A1, 1, A2)
-            @test mul!(B1, B2, 1) ≈ mul!(A1, A2, 1)
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            @testset for N in 2:6
+                dims = ntuple(n -> div(60, N), N)
+                perm1, perm2, perm3 = randperm(N), randperm(N), randperm(N)
+                R1_cpu, R2_cpu, R3_cpu = rand(T, dims), rand(T, dims), rand(T, dims)
+                R1 = make_arr(copy(R1_cpu))
+                R2 = make_arr(copy(R2_cpu))
+                R3 = make_arr(copy(R3_cpu))
+                B1 = permutedims(StridedView(R1), perm1)
+                B2 = permutedims(StridedView(R2), perm2)
+                B3 = permutedims(StridedView(R3), perm3)
+                A1 = Array(B1)
+                A2 = Array(B2)
+                A3 = Array(B3)
+
+                @test Array(rmul!(B1, 1 // 2)) ≈ rmul!(A1, 1 // 2)
+                @test Array(lmul!(1 // 3, B2)) ≈ lmul!(1 // 3, A2)
+                @test Array(axpy!(1 // 3, B1, B2)) ≈ axpy!(1 // 3, A1, A2)
+                @test Array(axpy!(1, B2, B3)) ≈ axpy!(1, A2, A3)
+                @test Array(axpby!(1 // 3, B1, 1 // 2, B3)) ≈ axpby!(1 // 3, A1, 1 // 2, A3)
+                @test Array(axpby!(1, B2, 1, B1)) ≈ axpby!(1, A2, 1, A1)
+                @test Array(map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3)) ≈
+                    map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3)
+                @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView
+                if make_arr === identity
+                    @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array
+                end
+                @test Array(mul!(B1, 1, B2)) ≈ mul!(A1, 1, A2)
+                @test Array(mul!(B1, B2, 1)) ≈ mul!(A1, A2, 1)
+            end
         end
     end
 end
 
 @testset "broadcast with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        R1, R2, R3 = rand(T, (10,)), rand(T, (10, 10)), rand(T, (10, 10, 10))
-        B1 = StridedView(R1)
-        B2 = permutedims(StridedView(R2), randperm(2))
-        B3 = permutedims(StridedView(R3), randperm(3))
-        A1 = convert(Array, B1)
-        A2 = convert(Array{T}, B2)
-        A3 = convert(Array{T, 3}, B3)
-
-        @test @inferred(B1 .+ sin.(B2 .- 3)) ≈ A1 .+ sin.(A2 .- 3)
-        @test @inferred(B2' .* B3 .- Ref(0.5)) ≈ A2' .* A3 .- Ref(0.5)
-        @test @inferred(B2' .* B3 .- max.(abs.(B1), real.(B3))) ≈
-            A2' .* A3 .- max.(abs.(A1), real.(A3))
-
-        @test (B1 .+ sin.(B2 .- 3)) isa StridedView
-        @test (B2' .* B3 .- Ref(0.5)) isa StridedView
-        @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView
-        @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            R1_cpu = rand(T, (10,))
+            R2_cpu = rand(T, (10, 10))
+            R3_cpu = rand(T, (10, 10, 10))
+            perm2, perm3 = randperm(2), randperm(3)
+            R1 = make_arr(copy(R1_cpu))
+            R2 = make_arr(copy(R2_cpu))
+            R3 = make_arr(copy(R3_cpu))
+            B1 = StridedView(R1)
+            B2 = permutedims(StridedView(R2), perm2)
+            B3 = permutedims(StridedView(R3), perm3)
+            A1 = Array(B1)
+            A2 = Array(B2)
+            A3 = Array(B3)
+
+            @test Array(@inferred(B1 .+ sin.(B2 .- 3))) ≈ A1 .+ sin.(A2 .- 3)
+            @test Array(@inferred(B2' .* B3 .- Ref(0.5))) ≈ A2' .* A3 .- Ref(0.5)
+            @test Array(@inferred(B2' .* B3 .- max.(abs.(B1), real.(B3)))) ≈
+                A2' .* A3 .- max.(abs.(A1), real.(A3))
+
+            @test (B1 .+ sin.(B2 .- 3)) isa StridedView
+            @test (B2' .* B3 .- Ref(0.5)) isa StridedView
+            @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView
+            if make_arr === identity
+                @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array
+            end
+        end
     end
 end
 
 @testset "broadcast with zero-length StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A1 = StridedView(zeros(T, (2, 0)))
-        A2 = StridedView(zeros(T, (2, 0)))
-        @test (A1 .+ A2) == StridedView(zeros(T, (2, 0)))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            A1 = StridedView(make_arr(zeros(T, (2, 0))))
+            A2 = StridedView(make_arr(zeros(T, (2, 0))))
+            @test Array(A1 .+ A2) == zeros(T, (2, 0))
+        end
     end
 end
 
 @testset "mapreduce with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        R1 = rand(T, (10, 10, 10, 10, 10, 10))
-        @test sum(R1; dims = (1, 3, 5)) ≈ sum(StridedView(R1); dims = (1, 3, 5))
-        @test mapreduce(sin, +, R1; dims = (1, 3, 5)) ≈
-            mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5))
-        R2 = rand(T, (10, 10, 10))
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, identity, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ reshape(R2, (10, 1, 1, 10, 10, 1))
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, x -> 0, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6))
-        R2c = copy(R2)
-        β = rand(T)
-        @test Strided._mapreducedim!(
-            sin, +, x -> β * x, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6)) .+
-            β .* reshape(R2, (10, 1, 1, 10, 10, 1))
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, x -> β, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6), init = β)
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, conj, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6)) .+
-            conj.(reshape(R2, (10, 1, 1, 10, 10, 1)))
-
-        R3 = rand(T, (100, 100, 2))
-        @test sum(R3; dims = (1, 2)) ≈ sum(StridedView(R3); dims = (1, 2))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            R1_cpu = rand(T, (10, 10, 10, 10, 10, 10))
+            R2_cpu = rand(T, (10, 10, 10))
+            R1 = make_arr(copy(R1_cpu))
+            R2 = make_arr(copy(R2_cpu))
+
+            @test sum(StridedView(R1); dims = (1, 3, 5)) isa StridedView
+            @test Array(sum(StridedView(R1); dims = (1, 3, 5))) ≈ sum(R1_cpu; dims = (1, 3, 5))
+            @test Array(mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5))) ≈
+                mapreduce(sin, +, R1_cpu; dims = (1, 3, 5))
+
+            R2c = copy(R2)
+            @test Array(Strided._mapreducedim!(
+                sin, +, identity, (10, 10, 10, 10, 10, 10),
+                (
+                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                    StridedView(R1),
+                )
+            )) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ reshape(R2_cpu, (10, 1, 1, 10, 10, 1))
+
+            R2c = copy(R2)
+            @test Array(Strided._mapreducedim!(
+                sin, +, x -> 0, (10, 10, 10, 10, 10, 10),
+                (
+                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                    StridedView(R1),
+                )
+            )) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6))
+
+            R2c = copy(R2)
+            β = rand(T)
+            @test Array(Strided._mapreducedim!(
+                sin, +, x -> β * x, (10, 10, 10, 10, 10, 10),
+                (
+                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                    StridedView(R1),
+                )
+            )) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+
+                β .* reshape(R2_cpu, (10, 1, 1, 10, 10, 1))
+
+            R2c = copy(R2)
+            @test Array(Strided._mapreducedim!(
+                sin, +, x -> β, (10, 10, 10, 10, 10, 10),
+                (
+                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                    StridedView(R1),
+                )
+            )) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6), init = β)
+
+            R2c = copy(R2)
+            @test Array(Strided._mapreducedim!(
+                sin, +, conj, (10, 10, 10, 10, 10, 10),
+                (
+                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                    StridedView(R1),
+                )
+            )) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+
+                conj.(reshape(R2_cpu, (10, 1, 1, 10, 10, 1)))
+
+            R3_cpu = rand(T, (100, 100, 2))
+            R3 = make_arr(copy(R3_cpu))
+            @test Array(sum(StridedView(R3); dims = (1, 2))) ≈ sum(R3_cpu; dims = (1, 2))
+        end
     end
 end
 
 @testset "complete reductions with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        R1 = rand(T, (10, 10, 10, 10, 10, 10))
-
-        @test sum(R1) ≈ sum(StridedView(R1))
-        @test maximum(abs, R1) ≈ maximum(abs, StridedView(R1))
-        @test minimum(real, R1) ≈ minimum(real, StridedView(R1))
-        @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R1))
-
-        R2 = PermutedDimsArray(R1, (randperm(6)...,))
-
-        @test sum(R2) ≈ sum(StridedView(R2))
-        @test maximum(abs, R2) ≈ maximum(abs, StridedView(R2))
-        @test minimum(real, R2) ≈ minimum(real, StridedView(R2))
-        @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R2))
-
-        R3 = rand(T, (5, 5, 5))
-        @test prod(exp, StridedView(R3)) ≈ exp(sum(StridedView(R3)))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            R1_cpu = rand(T, (10, 10, 10, 10, 10, 10)) # CPU reference data
+            R1 = make_arr(copy(R1_cpu)) # same data on the backend under test
+
+            @test sum(StridedView(R1)) ≈ sum(R1_cpu)
+            @test maximum(abs, StridedView(R1)) ≈ maximum(abs, R1_cpu)
+            @test minimum(real, StridedView(R1)) ≈ minimum(real, R1_cpu)
+            @test sum(x -> real(x) < 0, StridedView(R1)) == sum(x -> real(x) < 0, R1_cpu)
+
+            perm = (randperm(6)...,)
+            R2_cpu = PermutedDimsArray(R1_cpu, perm)
+            R2 = PermutedDimsArray(R1, perm)
+
+            @test sum(StridedView(R2)) ≈ sum(R2_cpu)
+            @test maximum(abs, StridedView(R2)) ≈ maximum(abs, R2_cpu)
+            @test minimum(real, StridedView(R2)) ≈ minimum(real, R2_cpu)
+            @test sum(x -> real(x) < 0, StridedView(R2)) == sum(x -> real(x) < 0, R2_cpu)
+
+            R3_cpu = rand(T, (5, 5, 5))
+            R3 = make_arr(copy(R3_cpu))
+            @test prod(exp, StridedView(R3)) ≈ exp(sum(R3_cpu)) # compare with the CPU reference, not the backend itself
+        end
     end
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index a26b337..3607bee 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,8 +12,8 @@ is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
 
 if !is_buildkite
     include("jlarrays.jl")
-    @testset "JLArray GPU mapreduce" begin
-        include("mapreduce_gpu.jl")
+    @testset "mapreduce tests" begin
+        include("mapreduce_tests.jl")
     end
     println("Base.Threads.nthreads() =  $(Base.Threads.nthreads())")
 

From 936c38e46624feb0622d51a0e69340044a613c35 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 11:57:57 -0400
Subject: [PATCH 09/17] add adjoint!, transpose! and permutedims! methods for StridedView

---
 src/linalg.jl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/linalg.jl b/src/linalg.jl
index 5b054ca..134db79 100644
--- a/src/linalg.jl
+++ b/src/linalg.jl
@@ -2,6 +2,13 @@
 LinearAlgebra.rmul!(dst::StridedView, α::Number) = mul!(dst, dst, α)
 LinearAlgebra.lmul!(α::Number, dst::StridedView) = mul!(dst, α, dst)
 
+LinearAlgebra.adjoint!(C::StridedView, A::StridedView) = copy!(C, adjoint(A))
+LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A))
+function Base.permutedims!(C::StridedView{T, N}, A::StridedView{T, N}, perm) where {T, N}
+    copy!(C, permutedims(A, perm))
+    return C
+end
+
 function LinearAlgebra.mul!(
         dst::StridedView{<:Number, N}, α::Number,
         src::StridedView{<:Number, N}

From c935288174437bef4e336e8302a63eb975c2ab26 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 11:58:06 -0400
Subject: [PATCH 10/17] convert GPU StridedView to Array via a contiguous GPU copy

---
 ext/StridedGPUArraysExt.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index a3f5a51..a2e4c34 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -24,6 +24,14 @@ function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS
     return dst
 end
 
+# Conversion to CPU Array: materialise into a contiguous GPU array first (so the
+# GPU-to-GPU copy! path is used), then let the GPU array type handle the transfer.
+function Base.Array(a::GPUStridedView{T, N}) where {T, N}
+    b = similar(parent(a), T, size(a))
+    copy!(StridedView(b), a)
+    return Array(b)
+end
+
 # lifted from GPUArrays.jl
 function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS}
     isempty(A) && return A

From 42bb19b2afdc911950aea435a58befbe982c8fc2 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 13:17:03 -0400
Subject: [PATCH 11/17] correctly allocate output type

---
 src/broadcast.jl | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/broadcast.jl b/src/broadcast.jl
index b480a70..bc816b8 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -20,8 +20,21 @@ function Broadcast.BroadcastStyle(
 end
 
 function Base.similar(bc::Broadcasted{<:StridedArrayStyle{N}}, ::Type{T}) where {N, T}
-    return StridedView(similar(convert(Broadcasted{DefaultArrayStyle{N}}, bc), T))
+    sv = _find_strided_view(bc)
+    if sv !== nothing
+        return StridedView(similar(parent(sv), T, size(bc)))
+    end
+    return StridedView(similar(Array{T}, axes(bc)))
+end
+
+@inline _find_strided_view(bc::Broadcasted) = _find_strided_view(bc.args...)
+@inline _find_strided_view(sv::StridedView, rest...) = sv
+@inline function _find_strided_view(nested::Broadcasted, rest...)
+    sv = _find_strided_view(nested)
+    sv === nothing ? _find_strided_view(rest...) : sv
 end
+@inline _find_strided_view(x, rest...) = _find_strided_view(rest...)
+@inline _find_strided_view() = nothing
 
 Base.dotview(a::StridedView{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} = getindex(a, I...)
 

From 5f991eeee72dca4d30f78543f0a3aeedfca77146 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 13:17:32 -0400
Subject: [PATCH 12/17] formatter

---
 ext/StridedGPUArraysExt.jl | 32 ++++++++++-------
 src/broadcast.jl           |  2 +-
 test/othertests.jl         | 70 ++++++++++++++++++++++----------------
 3 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index a2e4c34..2e7a8d3 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -99,17 +99,21 @@ end
     # Sequential reduction loop over reduction subspace
     @inbounds for red_linear in Base.OneTo(prod(red_sizes))
         red_cidx = CartesianIndices(red_sizes)[red_linear]
-        complete_cidx = CartesianIndex(ntuple(Val(N)) do d
-            @inbounds nred_cidx[d] + red_cidx[d] - 1
-        end)
-
-        val = f(ntuple(Val(length(inputs))) do m
-            @inbounds begin
-                a = inputs[m]
-                ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx)
-                a[ParentIndex(ip)]
+        complete_cidx = CartesianIndex(
+            ntuple(Val(N)) do d
+                @inbounds nred_cidx[d] + red_cidx[d] - 1
             end
-        end...)
+        )
+
+        val = f(
+            ntuple(Val(length(inputs))) do m
+                @inbounds begin
+                    a = inputs[m]
+                    ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx)
+                    a[ParentIndex(ip)]
+                end
+            end...
+        )
 
         acc = _gpu_accum(op, acc, val)
     end
@@ -162,9 +166,11 @@ function Strided._mapreduce_fuse!(
     inputs = ntuple(i -> inputs_raw[i], Val(M))
 
     # Number of output elements = product of non-reduction dims
-    out_total = prod(ntuple(Val(N)) do d
-        @inbounds iszero(out.strides[d]) ? 1 : dims[d]
-    end)
+    out_total = prod(
+        ntuple(Val(N)) do d
+            @inbounds iszero(out.strides[d]) ? 1 : dims[d]
+        end
+    )
 
     backend = KernelAbstractions.get_backend(parent(out))
     kernel! = _mapreduce_gpu_kernel!(backend)
diff --git a/src/broadcast.jl b/src/broadcast.jl
index bc816b8..b3151c9 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -31,7 +31,7 @@ end
 @inline _find_strided_view(sv::StridedView, rest...) = sv
 @inline function _find_strided_view(nested::Broadcasted, rest...)
     sv = _find_strided_view(nested)
-    sv === nothing ? _find_strided_view(rest...) : sv
+    return sv === nothing ? _find_strided_view(rest...) : sv
 end
 @inline _find_strided_view(x, rest...) = _find_strided_view(rest...)
 @inline _find_strided_view() = nothing
diff --git a/test/othertests.jl b/test/othertests.jl
index 223eb0f..ea5fee1 100644
--- a/test/othertests.jl
+++ b/test/othertests.jl
@@ -115,55 +115,65 @@ end
                 mapreduce(sin, +, R1_cpu; dims = (1, 3, 5))
 
             R2c = copy(R2)
-            @test Array(Strided._mapreducedim!(
-                sin, +, identity, (10, 10, 10, 10, 10, 10),
-                (
-                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                    StridedView(R1),
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, identity, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
                 )
-            )) ≈
+            ) ≈
                 mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ reshape(R2_cpu, (10, 1, 1, 10, 10, 1))
 
             R2c = copy(R2)
-            @test Array(Strided._mapreducedim!(
-                sin, +, x -> 0, (10, 10, 10, 10, 10, 10),
-                (
-                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                    StridedView(R1),
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, x -> 0, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
                 )
-            )) ≈
+            ) ≈
                 mapreduce(sin, +, R1_cpu; dims = (2, 3, 6))
 
             R2c = copy(R2)
             β = rand(T)
-            @test Array(Strided._mapreducedim!(
-                sin, +, x -> β * x, (10, 10, 10, 10, 10, 10),
-                (
-                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                    StridedView(R1),
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, x -> β * x, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
                 )
-            )) ≈
+            ) ≈
                 mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+
                 β .* reshape(R2_cpu, (10, 1, 1, 10, 10, 1))
 
             R2c = copy(R2)
-            @test Array(Strided._mapreducedim!(
-                sin, +, x -> β, (10, 10, 10, 10, 10, 10),
-                (
-                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                    StridedView(R1),
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, x -> β, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
                 )
-            )) ≈
+            ) ≈
                 mapreduce(sin, +, R1_cpu; dims = (2, 3, 6), init = β)
 
             R2c = copy(R2)
-            @test Array(Strided._mapreducedim!(
-                sin, +, conj, (10, 10, 10, 10, 10, 10),
-                (
-                    sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                    StridedView(R1),
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, conj, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
                 )
-            )) ≈
+            ) ≈
                 mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+
                 conj.(reshape(R2_cpu, (10, 1, 1, 10, 10, 1)))
 

From 87a801ceba2390abab860946d9f4778639e6905e Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 14:21:56 -0400
Subject: [PATCH 13/17] remove duplicate definitions

---
 src/linalg.jl    |  7 -------
 src/mapreduce.jl | 15 +++------------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/linalg.jl b/src/linalg.jl
index 134db79..5b054ca 100644
--- a/src/linalg.jl
+++ b/src/linalg.jl
@@ -2,13 +2,6 @@
 LinearAlgebra.rmul!(dst::StridedView, α::Number) = mul!(dst, dst, α)
 LinearAlgebra.lmul!(α::Number, dst::StridedView) = mul!(dst, α, dst)
 
-LinearAlgebra.adjoint!(C::StridedView, A::StridedView) = copy!(C, adjoint(A))
-LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A))
-function Base.permutedims!(C::StridedView{T, N}, A::StridedView{T, N}, perm) where {T, N}
-    copy!(C, permutedims(A, perm))
-    return C
-end
-
 function LinearAlgebra.mul!(
         dst::StridedView{<:Number, N}, α::Number,
         src::StridedView{<:Number, N}
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 8bfd7d9..a7d2f55 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -4,18 +4,9 @@ function Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) wher
 end
 Base.conj!(a::StridedView{<:Real}) = a
 Base.conj!(a::StridedView) = map!(conj, a, a)
-function LinearAlgebra.adjoint!(
-        dst::StridedView{<:Any, N},
-        src::StridedView{<:Any, N}
-    ) where {N}
-    return copy!(dst, adjoint(src))
-end
-function Base.permutedims!(
-        dst::StridedView{<:Any, N}, src::StridedView{<:Any, N},
-        p
-    ) where {N}
-    return copy!(dst, permutedims(src, p))
-end
+LinearAlgebra.adjoint!(dst::StridedView, src::StridedView) = copy!(dst, adjoint(src))
+LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A))
+Base.permutedims!(dst::StridedView, src::StridedView, p) = copy!(dst, permutedims(src, p))
 
 function Base.mapreduce(f, op, A::StridedView; dims = :, kw...)
     return Base._mapreduce_dim(f, op, values(kw), A, dims)

From 971056cf603a47cf4152beab50577bcecb3b7b73 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 14:25:43 -0400
Subject: [PATCH 14/17] add Metal to test deps

---
 Project.toml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index f6bf082..8774c8b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,23 +10,24 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
-GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 
 [extensions]
 StridedAMDGPUExt = "AMDGPU"
-StridedJLArraysExt = "JLArrays"
-StridedGPUArraysExt = "GPUArrays"
 StridedCUDAExt = "CUDA"
+StridedGPUArraysExt = "GPUArrays"
+StridedJLArraysExt = "JLArrays"
 
 [compat]
 AMDGPU = "2"
 Aqua = "0.8"
 CUDA = "5"
-JLArrays = "0.3.1"
 GPUArrays = "11.4.1"
+JLArrays = "0.3.1"
 LinearAlgebra = "1.6"
+Metal = "1.9"
 Random = "1.6"
 StridedViews = "0.4.6"
 Test = "1.6"
@@ -37,10 +38,11 @@ julia = "1.6"
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays"]
+test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays", "Metal"]

From 3872de423872755edb44a5943dcf1800a8863a27 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 16:35:11 -0400
Subject: [PATCH 15/17] remove fill! specialization

---
 ext/StridedGPUArraysExt.jl | 14 --------------
 src/mapreduce.jl           |  5 ++---
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 2e7a8d3..149c476 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -32,20 +32,6 @@ function Base.Array(a::GPUStridedView{T, N}) where {T, N}
     return Array(b)
 end
 
-# lifted from GPUArrays.jl
-function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS}
-    isempty(A) && return A
-    @kernel function fill_kernel!(a, val)
-        idx = @index(Global, Cartesian)
-        @inbounds a[idx] = val
-    end
-    # ndims check for 0D support
-    kernel = fill_kernel!(KernelAbstractions.get_backend(A))
-    f_x = F <: Union{typeof(conj), typeof(adjoint)} ? conj(x) : x
-    kernel(A, f_x; ndrange = size(A))
-    return A
-end
-
 function Strided.__mul!(
         C::GPUStridedView{TC, 2},
         A::GPUStridedView{TA, 2},
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index a7d2f55..c60aa5e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -1,12 +1,11 @@
 # Methods based on map!
-function Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N}
-    return map!(identity, dst, src)
-end
+Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N} = map!(identity, dst, src)
 Base.conj!(a::StridedView{<:Real}) = a
 Base.conj!(a::StridedView) = map!(conj, a, a)
 LinearAlgebra.adjoint!(dst::StridedView, src::StridedView) = copy!(dst, adjoint(src))
 LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A))
 Base.permutedims!(dst::StridedView, src::StridedView, p) = copy!(dst, permutedims(src, p))
+Base.fill!(A::StridedView, val) = map!(_ -> val, A, A) # A doubles as the (ignored) input, same form as conj! above
 
 function Base.mapreduce(f, op, A::StridedView; dims = :, kw...)
     return Base._mapreduce_dim(f, op, values(kw), A, dims)

From 90521ee9ace9fb3900b64375d257f8fe134a12ce Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 16:36:12 -0400
Subject: [PATCH 16/17] remove some GPU specializations

---
 ext/StridedGPUArraysExt.jl | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 149c476..e4f58a1 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -12,22 +12,10 @@ const GPUStridedView{T, N} = StridedView{T, N, <:AnyGPUArray{T}}
 
 KernelAbstractions.get_backend(sv::GPUStridedView) = KernelAbstractions.get_backend(parent(sv))
 
-function Base.Broadcast.BroadcastStyle(gpu_sv::GPUStridedView{T, N}) where {T, N}
-    raw_style = Base.Broadcast.BroadcastStyle(typeof(parent(gpu_sv)))
-    return typeof(raw_style)(Val(N)) # sets the dimensionality correctly
-end
-
-function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TS <: Number, NS, TAS <: AbstractGPUArray{TS}, FS <: ALL_FS}
-    bc_style = Base.Broadcast.BroadcastStyle(TAS)
-    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
-    GPUArrays._copyto!(dst, bc)
-    return dst
-end
-
 # Conversion to CPU Array: materialise into a contiguous GPU array first (so the
 # GPU-to-GPU copy! path is used), then let the GPU array type handle the transfer.
-function Base.Array(a::GPUStridedView{T, N}) where {T, N}
-    b = similar(parent(a), T, size(a))
+function Base.Array(a::GPUStridedView)
+    b = similar(parent(a), eltype(a), size(a))
     copy!(StridedView(b), a)
     return Array(b)
 end

From 4f2f364a08516e35c2fc2bccb8cdee23dac9f76f Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Mon, 16 Mar 2026 16:36:17 -0400
Subject: [PATCH 17/17] Clean up tests: remove jlarrays.jl

---
 test/jlarrays.jl | 19 -------------------
 test/runtests.jl |  1 -
 2 files changed, 20 deletions(-)
 delete mode 100644 test/jlarrays.jl

diff --git a/test/jlarrays.jl b/test/jlarrays.jl
deleted file mode 100644
index 5aceb35..0000000
--- a/test/jlarrays.jl
+++ /dev/null
@@ -1,19 +0,0 @@
-@testset for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
-    @testset "Copy with JLArrayStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
-        for m1 in (0, 16, 32), m2 in (0, 16, 32)
-            A1 = JLArray(randn(T, (m1, m2)))
-            A2 = similar(A1)
-            zA1 = JLArray(f1(zeros(T, (m1, m2))))
-            zA2 = JLArray(f2(zeros(T, (m1, m2))))
-            A1c = copy(A1)
-            A2c = copy(A2)
-            B1 = f1(StridedView(A1c))
-            B2 = f2(StridedView(A2c))
-            axes(f1(A1)) == axes(f2(A2)) || continue
-            @test collect(Matrix(copy!(f2(A2), f1(A1)))) == JLArrays.Adapt.adapt(Vector{T}, copy!(B2, B1))
-            @test copy!(zA1, f1(A1)) == copy!(zA2, B1)
-            x = rand(T)
-            @test f1(StridedView(JLArrays.Adapt.adapt(Vector{T}, fill!(A1c, x)))) == JLArrays.Adapt.adapt(Vector{T}, fill!(B1, x))
-        end
-    end
-end
diff --git a/test/runtests.jl b/test/runtests.jl
index 3607bee..80dd88b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,7 +11,6 @@ Random.seed!(1234)
 is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
 
 if !is_buildkite
-    include("jlarrays.jl")
     @testset "mapreduce tests" begin
         include("mapreduce_tests.jl")
     end