diff --git a/Project.toml b/Project.toml
index f6bf082..8774c8b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,23 +10,24 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
-GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 
 [extensions]
 StridedAMDGPUExt = "AMDGPU"
-StridedJLArraysExt = "JLArrays"
-StridedGPUArraysExt = "GPUArrays"
 StridedCUDAExt = "CUDA"
+StridedGPUArraysExt = "GPUArrays"
+StridedJLArraysExt = "JLArrays"
 
 [compat]
 AMDGPU = "2"
 Aqua = "0.8"
 CUDA = "5"
-JLArrays = "0.3.1"
 GPUArrays = "11.4.1"
+JLArrays = "0.3.1"
 LinearAlgebra = "1.6"
+Metal = "1.9"
 Random = "1.6"
 StridedViews = "0.4.6"
 Test = "1.6"
@@ -37,10 +38,11 @@ julia = "1.6"
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays"]
+test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays", "Metal"]
diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl
index 608d8b5..e4f58a1 100644
--- a/ext/StridedGPUArraysExt.jl
+++ b/ext/StridedGPUArraysExt.jl
@@ -3,44 +3,154 @@ module StridedGPUArraysExt
 using Strided, GPUArrays
 using GPUArrays: Adapt, KernelAbstractions
 using GPUArrays.KernelAbstractions: @kernel, @index
+using StridedViews: ParentIndex
 
 ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv))
+# StridedView backed by any GPU array type, with element type linked to the parent.
+const GPUStridedView{T, N} = StridedView{T, N, <:AnyGPUArray{T}}
 
-function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}}
-    raw_style = Base.Broadcast.BroadcastStyle(TA)
-    return typeof(raw_style)(Val(N)) # sets the dimensionality correctly
-end
+KernelAbstractions.get_backend(sv::GPUStridedView) = KernelAbstractions.get_backend(parent(sv))
 
-function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TS <: Number, NS, TAS <: AbstractGPUArray{TS}, FS <: ALL_FS}
-    bc_style = Base.Broadcast.BroadcastStyle(TAS)
-    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
-    GPUArrays._copyto!(dst, bc)
-    return dst
-end
-
-# lifted from GPUArrays.jl
-function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS}
-    isempty(A) && return A
-    @kernel function fill_kernel!(a, val)
-        idx = @index(Global, Cartesian)
-        @inbounds a[idx] = val
-    end
-    # ndims check for 0D support
-    kernel = fill_kernel!(KernelAbstractions.get_backend(A))
-    f_x = F <: Union{typeof(conj), typeof(adjoint)} ? conj(x) : x
-    kernel(A, f_x; ndrange = size(A))
-    return A
+# Conversion to CPU Array: materialise into a contiguous GPU array first (so the
+# GPU-to-GPU copy! path is used), then let the GPU array type handle the transfer.
+function Base.Array(a::GPUStridedView)
+    b = similar(parent(a), eltype(a), size(a))
+    copy!(StridedView(b), a)
+    return Array(b)
 end
 
 function Strided.__mul!(
-        C::StridedView{TC, 2, <:AnyGPUArray{TC}},
-        A::StridedView{TA, 2, <:AnyGPUArray{TA}},
-        B::StridedView{TB, 2, <:AnyGPUArray{TB}},
+        C::GPUStridedView{TC, 2},
+        A::GPUStridedView{TA, 2},
+        B::GPUStridedView{TB, 2},
         α::Number, β::Number
     ) where {TC, TA, TB}
     return GPUArrays.generic_matmatmul!(C, A, B, α, β)
 end
 
+# ---------- GPU mapreduce support ----------
+
+@inline _gpu_init_acc(::Nothing, current_val) = current_val
+@inline _gpu_init_acc(initop, current_val) = initop(current_val)
+
+@inline _gpu_accum(::Nothing, acc, val) = val
+@inline _gpu_accum(op, acc, val) = op(acc, val)
+
+@inline function _strides_dot(strides::NTuple{N, Int}, cidx::CartesianIndex{N}) where {N}
+    s = 0
+    for d in Base.OneTo(N)
+        @inbounds s += strides[d] * (cidx[d] - 1)
+    end
+    return s
+end
+
+@kernel function _mapreduce_gpu_kernel!(
+        f, op, initop,
+        dims::NTuple{N, Int},
+        out::OT,
+        inputs::IT
+    ) where {N, OT <: StridedView, IT <: Tuple}
+
+    out_linear = @index(Global, Linear)
+
+    # Non-reduction subspace sizes (1 for reduction dims)
+    nred_sizes = ntuple(Val(N)) do d
+        @inbounds iszero(out.strides[d]) ? 1 : dims[d]
+    end
+    # Reduction subspace sizes (1 for non-reduction dims)
+    red_sizes = ntuple(Val(N)) do d
+        @inbounds iszero(out.strides[d]) ? dims[d] : 1
+    end
+
+    # Map out_linear → cartesian in non-reduction subspace
+    nred_cidx = CartesianIndices(nred_sizes)[out_linear]
+    out_parent = out.offset + 1 + _strides_dot(out.strides, nred_cidx)
+
+    # Initialize accumulator from current output value (or apply initop)
+    @inbounds acc = _gpu_init_acc(initop, out[ParentIndex(out_parent)])
+
+    # Sequential reduction loop over reduction subspace
+    @inbounds for red_linear in Base.OneTo(prod(red_sizes))
+        red_cidx = CartesianIndices(red_sizes)[red_linear]
+        complete_cidx = CartesianIndex(
+            ntuple(Val(N)) do d
+                @inbounds nred_cidx[d] + red_cidx[d] - 1
+            end
+        )
+
+        val = f(
+            ntuple(Val(length(inputs))) do m
+                @inbounds begin
+                    a = inputs[m]
+                    ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx)
+                    a[ParentIndex(ip)]
+                end
+            end...
+        )
+
+        acc = _gpu_accum(op, acc, val)
+    end
+
+    @inbounds out[ParentIndex(out_parent)] = acc
+end
+
+# GPU-compatible _mapreduce: avoids scalar indexing (first(A), out[ParentIndex(1)])
+# that JLArrays/real GPUs prohibit. Mirrors GPUArrays' neutral_element approach:
+# infer output type via Broadcast machinery, look up the neutral element (errors on
+# unknown ops), fill the output buffer, then read back a single scalar via Array().
+function Strided._mapreduce(
+        f, op, A::GPUStridedView{T, N}, nt = nothing
+    ) where {T, N}
+    if length(A) == 0
+        b = Base.mapreduce_empty(f, op, T)
+        return nt === nothing ? b : op(b, nt.init)
+    end
+
+    dims = size(A)
+
+    if nt === nothing
+        ET = Base.Broadcast.combine_eltypes(f, (A,))
+        ET = Base.promote_op(op, ET, ET)
+        (ET === Union{} || ET === Any) &&
+            error("cannot infer output element type for mapreduce; pass an explicit `init`")
+        init = GPUArrays.neutral_element(op, ET)
+    else
+        ET = typeof(nt.init)
+        init = nt.init
+    end
+
+    out = similar(parent(A), ET, (1,))
+    fill!(out, init)
+
+    Strided._mapreducedim!(f, op, nothing, dims, (sreshape(StridedView(out), one.(dims)), A))
+
+    return Array(out)[1]
+end
+
+function Strided._mapreduce_fuse!(
+        f, op, initop,
+        dims::Dims{N},
+        arrays::Tuple{GPUStridedView{TO, N}, Vararg{GPUStridedView{<:Any, N}}}
+    ) where {TO, N}
+
+    out = arrays[1]
+    inputs_raw = Base.tail(arrays)
+    M = length(inputs_raw)
+    inputs = ntuple(i -> inputs_raw[i], Val(M))
+
+    # Number of output elements = product of non-reduction dims
+    out_total = prod(
+        ntuple(Val(N)) do d
+            @inbounds iszero(out.strides[d]) ? 1 : dims[d]
+        end
+    )
+
+    backend = KernelAbstractions.get_backend(parent(out))
+    kernel! = _mapreduce_gpu_kernel!(backend)
+    kernel!(f, op, initop, dims, out, inputs; ndrange = out_total)
+
+    return nothing
+end
+
 end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index b480a70..b3151c9 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -20,8 +20,21 @@ function Broadcast.BroadcastStyle(
 end
 
 function Base.similar(bc::Broadcasted{<:StridedArrayStyle{N}}, ::Type{T}) where {N, T}
-    return StridedView(similar(convert(Broadcasted{DefaultArrayStyle{N}}, bc), T))
+    sv = _find_strided_view(bc)
+    if sv !== nothing
+        return StridedView(similar(parent(sv), T, size(bc)))
+    end
+    return StridedView(similar(Array{T}, axes(bc)))
+end
+
+@inline _find_strided_view(bc::Broadcasted) = _find_strided_view(bc.args...)
+@inline _find_strided_view(sv::StridedView, rest...) = sv
+@inline function _find_strided_view(nested::Broadcasted, rest...)
+    sv = _find_strided_view(nested)
+    return sv === nothing ? _find_strided_view(rest...) : sv
 end
+@inline _find_strided_view(x, rest...) = _find_strided_view(rest...)
+@inline _find_strided_view() = nothing
 
 Base.dotview(a::StridedView{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} = getindex(a, I...)
 
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 8bfd7d9..c60aa5e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -1,21 +1,11 @@
 # Methods based on map!
-function Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N}
-    return map!(identity, dst, src)
-end
+Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N} = map!(identity, dst, src)
 Base.conj!(a::StridedView{<:Real}) = a
 Base.conj!(a::StridedView) = map!(conj, a, a)
-function LinearAlgebra.adjoint!(
-        dst::StridedView{<:Any, N},
-        src::StridedView{<:Any, N}
-    ) where {N}
-    return copy!(dst, adjoint(src))
-end
-function Base.permutedims!(
-        dst::StridedView{<:Any, N}, src::StridedView{<:Any, N},
-        p
-    ) where {N}
-    return copy!(dst, permutedims(src, p))
-end
+LinearAlgebra.adjoint!(dst::StridedView, src::StridedView) = copy!(dst, adjoint(src))
+LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A))
+Base.permutedims!(dst::StridedView, src::StridedView, p) = copy!(dst, permutedims(src, p))
+Base.fill!(A::StridedView, val) = map!(Returns(val), A)
 
 function Base.mapreduce(f, op, A::StridedView; dims = :, kw...)
     return Base._mapreduce_dim(f, op, values(kw), A, dims)
diff --git a/test/jlarrays.jl b/test/jlarrays.jl
deleted file mode 100644
index 5aceb35..0000000
--- a/test/jlarrays.jl
+++ /dev/null
@@ -1,19 +0,0 @@
-@testset for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
-    @testset "Copy with JLArrayStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
-        for m1 in (0, 16, 32), m2 in (0, 16, 32)
-            A1 = JLArray(randn(T, (m1, m2)))
-            A2 = similar(A1)
-            zA1 = JLArray(f1(zeros(T, (m1, m2))))
-            zA2 = JLArray(f2(zeros(T, (m1, m2))))
-            A1c = copy(A1)
-            A2c = copy(A2)
-            B1 = f1(StridedView(A1c))
-            B2 = f2(StridedView(A2c))
-            axes(f1(A1)) == axes(f2(A2)) || continue
-            @test collect(Matrix(copy!(f2(A2), f1(A1)))) == JLArrays.Adapt.adapt(Vector{T}, copy!(B2, B1))
-            @test copy!(zA1, f1(A1)) == copy!(zA2, B1)
-            x = rand(T)
-            @test f1(StridedView(JLArrays.Adapt.adapt(Vector{T}, fill!(A1c, x)))) == JLArrays.Adapt.adapt(Vector{T}, fill!(B1, x))
-        end
-    end
-end
diff --git a/test/mapreduce_tests.jl b/test/mapreduce_tests.jl
new file mode 100644
index 0000000..46b3b34
--- /dev/null
+++ b/test/mapreduce_tests.jl
@@ -0,0 +1,145 @@
+# Parameterized mapreduce / map! tests.
+# Iterates over both Array and JLArray backends internally.
+
+backends = [("Array", identity), ("JLArray", JLArray)]
+
+for (backend_name, make_arr) in backends
+    @testset "$backend_name: map! via StridedView" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            A = make_arr(rand(T, 8, 6))
+            B = similar(A)
+            map!(x -> 2x, StridedView(B), StridedView(A))
+            @test Array(StridedView(B)) ≈ 2 .* Array(StridedView(A))
+        end
+    end
+
+    @testset "$backend_name: mapreducedim! — sum over dim 1" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 1, 6))
+            sum!(StridedView(B), StridedView(A))
+            @test Array(StridedView(B)) ≈ sum(data; dims = 1)
+        end
+    end
+
+    @testset "$backend_name: mapreducedim! — sum over dim 2" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 8, 1))
+            sum!(StridedView(B), StridedView(A))
+            @test Array(StridedView(B)) ≈ sum(data; dims = 2)
+        end
+    end
+
+    @testset "$backend_name: map! with conj/adjoint StridedView" begin
+        for T in (ComplexF32, ComplexF64)
+            data = rand(T, 4, 4)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 4, 4))
+            copy!(adjoint(StridedView(B)), StridedView(A))
+            @test Array(StridedView(B)) ≈ adjoint(data)
+        end
+    end
+
+    @testset "$backend_name: mapreduce — full scalar reduction" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            result = sum(StridedView(A))
+            @test result isa T
+            @test result ≈ sum(data)
+        end
+    end
+
+    # Only meaningful for GPU backends: mixing CPU and GPU inputs must not silently
+    # use the GPU dispatch path.
+    if make_arr !== identity
+        @testset "$backend_name: dispatch requires all inputs on GPU" begin
+            A_gpu = make_arr(rand(Float32, 4, 4))
+            A_cpu = Array(StridedView(A_gpu))
+            B_gpu = make_arr(zeros(Float32, 4, 4))
+            @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu))
+        end
+    end
+
+    @testset "$backend_name: map! — stride-2 input (every other row)" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 4, 6))
+            src = StridedView(A)[1:2:8, :]
+            map!(x -> 2x, StridedView(B), src)
+            @test Array(StridedView(B)) ≈ 2 .* data[1:2:8, :]
+        end
+    end
+
+    @testset "$backend_name: map! — stride-2 output (every other row)" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 4, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 8, 6))
+            dst = StridedView(B)[1:2:8, :]
+            map!(identity, dst, StridedView(A))
+            B_cpu = Array(StridedView(B))
+            @test B_cpu[1:2:8, :] ≈ data
+            @test all(iszero, B_cpu[2:2:8, :])
+        end
+    end
+
+    @testset "$backend_name: map! — subview with nonzero offset" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 5, 4))
+            src = StridedView(A)[2:6, 3:6]
+            map!(x -> x + 1, StridedView(B), src)
+            @test Array(StridedView(B)) ≈ data[2:6, 3:6] .+ 1
+        end
+    end
+
+    @testset "$backend_name: map! — permuted (transposed) strides" begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data = rand(T, 6, 8)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 8, 6))
+            src = permutedims(StridedView(A), (2, 1))
+            map!(identity, StridedView(B), src)
+            @test Array(StridedView(B)) ≈ permutedims(data, (2, 1))
+        end
+    end
+
+    @testset "$backend_name: sum over dim 1 — stride-2 input" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 1, 6))
+            src = StridedView(A)[1:2:8, :]
+            sum!(StridedView(B), src)
+            @test Array(StridedView(B)) ≈ sum(data[1:2:8, :]; dims = 1)
+        end
+    end
+
+    @testset "$backend_name: sum over dim 2 — subview with offset" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            B = make_arr(zeros(T, 5, 1))
+            src = StridedView(A)[2:6, 2:5]
+            sum!(StridedView(B), src)
+            @test Array(StridedView(B)) ≈ sum(data[2:6, 2:5]; dims = 2)
+        end
+    end
+
+    @testset "$backend_name: full scalar reduction — stride-2 and offset subview" begin
+        for T in (Float32, Float64)
+            data = rand(T, 8, 6)
+            A = make_arr(copy(data))
+            r1 = sum(StridedView(A)[1:2:8, :])
+            @test r1 ≈ sum(data[1:2:8, :])
+            r2 = sum(StridedView(A)[3:6, 2:5])
+            @test r2 ≈ sum(data[3:6, 2:5])
+        end
+    end
+end
diff --git a/test/othertests.jl b/test/othertests.jl
index 8c775ce..ea5fee1 100644
--- a/test/othertests.jl
+++ b/test/othertests.jl
@@ -1,157 +1,213 @@
+backends = [("Array", identity), ("JLArray", JLArray)]
+
 @testset "in-place matrix operations" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A1 = randn(T, (1000, 1000))
-        A2 = similar(A1)
-        A1c = copy(A1)
-        A2c = copy(A2)
-        B1 = StridedView(A1c)
-        B2 = StridedView(A2c)
-
-        @test conj!(A1) == conj!(B1)
-        @test adjoint!(A2, A1) == adjoint!(B2, B1)
-        @test transpose!(A2, A1) == transpose!(B2, B1)
-        @test permutedims!(A2, A1, (2, 1)) == permutedims!(B2, B1, (2, 1))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            data1 = randn(T, (1000, 1000))
+            data2 = randn(T, (1000, 1000))
+            # CPU reference
+            A1 = copy(data1); A2 = copy(data2)
+            # Backend under test
+            B1 = StridedView(make_arr(copy(data1)))
+            B2 = StridedView(make_arr(copy(data2)))
+
+            conj!(A1);                        conj!(B1)
+            @test A1 ≈ Array(B1)
+            adjoint!(A2, A1);                 adjoint!(B2, B1)
+            @test A2 ≈ Array(B2)
+            transpose!(A2, A1);               transpose!(B2, B1)
+            @test A2 ≈ Array(B2)
+            permutedims!(A2, A1, (2, 1));     permutedims!(B2, B1, (2, 1))
+            @test A2 ≈ Array(B2)
+        end
     end
 end
 
 @testset "map, scale!, axpy! and axpby! with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        @testset for N in 2:6
-            dims = ntuple(n -> div(60, N), N)
-            R1, R2, R3 = rand(T, dims), rand(T, dims), rand(T, dims)
-            B1 = permutedims(StridedView(R1), randperm(N))
-            B2 = permutedims(StridedView(R2), randperm(N))
-            B3 = permutedims(StridedView(R3), randperm(N))
-            A1 = convert(Array, B1)
-            A2 = convert(Array{T}, B2) # test different converts
-            A3 = convert(Array{T, N}, B3)
-            C1 = deepcopy(B1)
-
-            @test rmul!(B1, 1 // 2) ≈ rmul!(A1, 1 // 2)
-            @test lmul!(1 // 3, B2) ≈ lmul!(1 // 3, A2)
-            @test axpy!(1 // 3, B1, B2) ≈ axpy!(1 // 3, A1, A2)
-            @test axpy!(1, B2, B3) ≈ axpy!(1, A2, A3)
-            @test axpby!(1 // 3, B1, 1 // 2, B3) ≈ axpby!(1 // 3, A1, 1 // 2, A3)
-            @test axpby!(1, B2, 1, B1) ≈ axpby!(1, A2, 1, A1)
-            @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) ≈
-                map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3)
-            @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView
-            @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array
-            @test mul!(B1, 1, B2) ≈ mul!(A1, 1, A2)
-            @test mul!(B1, B2, 1) ≈ mul!(A1, A2, 1)
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            @testset for N in 2:6
+                dims = ntuple(n -> div(60, N), N)
+                perm1, perm2, perm3 = randperm(N), randperm(N), randperm(N)
+                R1_cpu, R2_cpu, R3_cpu = rand(T, dims), rand(T, dims), rand(T, dims)
+                R1 = make_arr(copy(R1_cpu))
+                R2 = make_arr(copy(R2_cpu))
+                R3 = make_arr(copy(R3_cpu))
+                B1 = permutedims(StridedView(R1), perm1)
+                B2 = permutedims(StridedView(R2), perm2)
+                B3 = permutedims(StridedView(R3), perm3)
+                A1 = Array(B1)
+                A2 = Array(B2)
+                A3 = Array(B3)
+
+                @test Array(rmul!(B1, 1 // 2)) ≈ rmul!(A1, 1 // 2)
+                @test Array(lmul!(1 // 3, B2)) ≈ lmul!(1 // 3, A2)
+                @test Array(axpy!(1 // 3, B1, B2)) ≈ axpy!(1 // 3, A1, A2)
+                @test Array(axpy!(1, B2, B3)) ≈ axpy!(1, A2, A3)
+                @test Array(axpby!(1 // 3, B1, 1 // 2, B3)) ≈ axpby!(1 // 3, A1, 1 // 2, A3)
+                @test Array(axpby!(1, B2, 1, B1)) ≈ axpby!(1, A2, 1, A1)
+                @test Array(map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3)) ≈
+                    map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3)
+                @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView
+                if make_arr === identity
+                    @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array
+                end
+                @test Array(mul!(B1, 1, B2)) ≈ mul!(A1, 1, A2)
+                @test Array(mul!(B1, B2, 1)) ≈ mul!(A1, A2, 1)
+            end
         end
     end
 end
 
 @testset "broadcast with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        R1, R2, R3 = rand(T, (10,)), rand(T, (10, 10)), rand(T, (10, 10, 10))
-        B1 = StridedView(R1)
-        B2 = permutedims(StridedView(R2), randperm(2))
-        B3 = permutedims(StridedView(R3), randperm(3))
-        A1 = convert(Array, B1)
-        A2 = convert(Array{T}, B2)
-        A3 = convert(Array{T, 3}, B3)
-
-        @test @inferred(B1 .+ sin.(B2 .- 3)) ≈ A1 .+ sin.(A2 .- 3)
-        @test @inferred(B2' .* B3 .- Ref(0.5)) ≈ A2' .* A3 .- Ref(0.5)
-        @test @inferred(B2' .* B3 .- max.(abs.(B1), real.(B3))) ≈
-            A2' .* A3 .- max.(abs.(A1), real.(A3))
-
-        @test (B1 .+ sin.(B2 .- 3)) isa StridedView
-        @test (B2' .* B3 .- Ref(0.5)) isa StridedView
-        @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView
-        @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            R1_cpu = rand(T, (10,))
+            R2_cpu = rand(T, (10, 10))
+            R3_cpu = rand(T, (10, 10, 10))
+            perm2, perm3 = randperm(2), randperm(3)
+            R1 = make_arr(copy(R1_cpu))
+            R2 = make_arr(copy(R2_cpu))
+            R3 = make_arr(copy(R3_cpu))
+            B1 = StridedView(R1)
+            B2 = permutedims(StridedView(R2), perm2)
+            B3 = permutedims(StridedView(R3), perm3)
+            A1 = Array(B1)
+            A2 = Array(B2)
+            A3 = Array(B3)
+
+            @test Array(@inferred(B1 .+ sin.(B2 .- 3))) ≈ A1 .+ sin.(A2 .- 3)
+            @test Array(@inferred(B2' .* B3 .- Ref(0.5))) ≈ A2' .* A3 .- Ref(0.5)
+            @test Array(@inferred(B2' .* B3 .- max.(abs.(B1), real.(B3)))) ≈
+                A2' .* A3 .- max.(abs.(A1), real.(A3))
+
+            @test (B1 .+ sin.(B2 .- 3)) isa StridedView
+            @test (B2' .* B3 .- Ref(0.5)) isa StridedView
+            @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView
+            if make_arr === identity
+                @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array
+            end
+        end
     end
 end
 
 @testset "broadcast with zero-length StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        A1 = StridedView(zeros(T, (2, 0)))
-        A2 = StridedView(zeros(T, (2, 0)))
-        @test (A1 .+ A2) == StridedView(zeros(T, (2, 0)))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            A1 = StridedView(make_arr(zeros(T, (2, 0))))
+            A2 = StridedView(make_arr(zeros(T, (2, 0))))
+            @test Array(A1 .+ A2) == zeros(T, (2, 0))
+        end
     end
 end
 
 @testset "mapreduce with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        R1 = rand(T, (10, 10, 10, 10, 10, 10))
-        @test sum(R1; dims = (1, 3, 5)) ≈ sum(StridedView(R1); dims = (1, 3, 5))
-        @test mapreduce(sin, +, R1; dims = (1, 3, 5)) ≈
-            mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5))
-        R2 = rand(T, (10, 10, 10))
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, identity, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ reshape(R2, (10, 1, 1, 10, 10, 1))
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, x -> 0, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6))
-        R2c = copy(R2)
-        β = rand(T)
-        @test Strided._mapreducedim!(
-            sin, +, x -> β * x, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6)) .+
-            β .* reshape(R2, (10, 1, 1, 10, 10, 1))
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, x -> β, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6), init = β)
-        R2c = copy(R2)
-        @test Strided._mapreducedim!(
-            sin, +, conj, (10, 10, 10, 10, 10, 10),
-            (
-                sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
-                StridedView(R1),
-            )
-        ) ≈
-            mapreduce(sin, +, R1; dims = (2, 3, 6)) .+
-            conj.(reshape(R2, (10, 1, 1, 10, 10, 1)))
-
-        R3 = rand(T, (100, 100, 2))
-        @test sum(R3; dims = (1, 2)) ≈ sum(StridedView(R3); dims = (1, 2))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            R1_cpu = rand(T, (10, 10, 10, 10, 10, 10))
+            R2_cpu = rand(T, (10, 10, 10))
+            R1 = make_arr(copy(R1_cpu))
+            R2 = make_arr(copy(R2_cpu))
+
+            @test sum(StridedView(R1); dims = (1, 3, 5)) isa StridedView
+            @test Array(sum(StridedView(R1); dims = (1, 3, 5))) ≈ sum(R1_cpu; dims = (1, 3, 5))
+            @test Array(mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5))) ≈
+                mapreduce(sin, +, R1_cpu; dims = (1, 3, 5))
+
+            R2c = copy(R2)
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, identity, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
+                )
+            ) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ reshape(R2_cpu, (10, 1, 1, 10, 10, 1))
+
+            R2c = copy(R2)
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, x -> 0, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
+                )
+            ) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6))
+
+            R2c = copy(R2)
+            β = rand(T)
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, x -> β * x, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
+                )
+            ) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+
+                β .* reshape(R2_cpu, (10, 1, 1, 10, 10, 1))
+
+            R2c = copy(R2)
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, x -> β, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
+                )
+            ) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6), init = β)
+
+            R2c = copy(R2)
+            @test Array(
+                Strided._mapreducedim!(
+                    sin, +, conj, (10, 10, 10, 10, 10, 10),
+                    (
+                        sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)),
+                        StridedView(R1),
+                    )
+                )
+            ) ≈
+                mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+
+                conj.(reshape(R2_cpu, (10, 1, 1, 10, 10, 1)))
+
+            R3_cpu = rand(T, (100, 100, 2))
+            R3 = make_arr(copy(R3_cpu))
+            @test Array(sum(StridedView(R3); dims = (1, 2))) ≈ sum(R3_cpu; dims = (1, 2))
+        end
     end
 end
 
 @testset "complete reductions with StridedView" begin
-    @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
-        R1 = rand(T, (10, 10, 10, 10, 10, 10))
-
-        @test sum(R1) ≈ sum(StridedView(R1))
-        @test maximum(abs, R1) ≈ maximum(abs, StridedView(R1))
-        @test minimum(real, R1) ≈ minimum(real, StridedView(R1))
-        @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R1))
-
-        R2 = PermutedDimsArray(R1, (randperm(6)...,))
-
-        @test sum(R2) ≈ sum(StridedView(R2))
-        @test maximum(abs, R2) ≈ maximum(abs, StridedView(R2))
-        @test minimum(real, R2) ≈ minimum(real, StridedView(R2))
-        @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R2))
-
-        R3 = rand(T, (5, 5, 5))
-        @test prod(exp, StridedView(R3)) ≈ exp(sum(StridedView(R3)))
+    for (backend_name, make_arr) in backends
+        @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64)
+            R1_cpu = rand(T, (10, 10, 10, 10, 10, 10))
+            R1 = make_arr(copy(R1_cpu))
+
+            @test sum(StridedView(R1)) ≈ sum(R1_cpu)
+            @test maximum(abs, StridedView(R1)) ≈ maximum(abs, R1_cpu)
+            @test minimum(real, StridedView(R1)) ≈ minimum(real, R1_cpu)
+            @test sum(x -> real(x) < 0, StridedView(R1)) == sum(x -> real(x) < 0, R1_cpu)
+
+            perm = (randperm(6)...,)
+            R2_cpu = PermutedDimsArray(R1_cpu, perm)
+            R2 = PermutedDimsArray(R1, perm)
+
+            @test sum(StridedView(R2)) ≈ sum(R2_cpu)
+            @test maximum(abs, StridedView(R2)) ≈ maximum(abs, R2_cpu)
+            @test minimum(real, StridedView(R2)) ≈ minimum(real, R2_cpu)
+            @test sum(x -> real(x) < 0, StridedView(R2)) == sum(x -> real(x) < 0, R1_cpu)
+
+            R3_cpu = rand(T, (5, 5, 5))
+            R3 = make_arr(copy(R3_cpu))
+            @test prod(exp, StridedView(R3)) ≈ exp(sum(StridedView(R3)))
+        end
     end
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 4108876..80dd88b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,7 +11,9 @@ Random.seed!(1234)
 is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
 
 if !is_buildkite
-    include("jlarrays.jl")
+    @testset "mapreduce tests" begin
+        include("mapreduce_tests.jl")
+    end
     println("Base.Threads.nthreads() =  $(Base.Threads.nthreads())")
 
     println("Running tests single-threaded:")