From e9cd9c168c0c3ef7037fbd5eba11f1dbb84efc8f Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Sun, 15 Mar 2026 08:48:42 -0400 Subject: [PATCH 01/17] Add GPU mapreduce support via KernelAbstractions kernel Override _mapreduce_fuse! for GPU-backed StridedViews to dispatch to a KernelAbstractions kernel instead of the CPU-specific threaded/SIMD path. One GPU thread per output element with a sequential inner loop over reduction dimensions. Handles pure map (op=nothing), reductions, initop, and conj/adjoint views via ParentIndex semantics. Co-Authored-By: Claude Sonnet 4.6 --- ext/StridedGPUArraysExt.jl | 86 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 608d8b5..7c14ae7 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -3,6 +3,7 @@ module StridedGPUArraysExt using Strided, GPUArrays using GPUArrays: Adapt, KernelAbstractions using GPUArrays.KernelAbstractions: @kernel, @index +using StridedViews: ParentIndex ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} @@ -43,4 +44,89 @@ function Strided.__mul!( return GPUArrays.generic_matmatmul!(C, A, B, α, β) end +# ---------- GPU mapreduce support ---------- + +@inline _gpu_init_acc(::Nothing, current_val) = current_val +@inline _gpu_init_acc(initop, current_val) = initop(current_val) + +@inline _gpu_accum(::Nothing, acc, val) = val +@inline _gpu_accum(op, acc, val) = op(acc, val) + +@inline function _strides_dot(strides::NTuple{N, Int}, cidx::CartesianIndex{N}) where {N} + s = 0 + for d in Base.OneTo(N) + @inbounds s += strides[d] * (cidx[d] - 1) + end + return s +end + +@kernel function _mapreduce_gpu_kernel!( + f, op, initop, + dims::NTuple{N, Int}, + out::OT, + inputs::IT + ) where {N, OT <: StridedView, IT <: Tuple} + + out_linear = @index(Global, Linear) + + # Non-reduction subspace sizes (1 for reduction dims) + nred_sizes = ntuple(Val(N)) do d + @inbounds 
iszero(out.strides[d]) ? 1 : dims[d] + end + # Reduction subspace sizes (1 for non-reduction dims) + red_sizes = ntuple(Val(N)) do d + @inbounds iszero(out.strides[d]) ? dims[d] : 1 + end + + # Map out_linear → cartesian in non-reduction subspace + nred_cidx = CartesianIndices(nred_sizes)[out_linear] + out_parent = out.offset + 1 + _strides_dot(out.strides, nred_cidx) + + # Initialize accumulator from current output value (or apply initop) + @inbounds acc = _gpu_init_acc(initop, out[ParentIndex(out_parent)]) + + # Sequential reduction loop over reduction subspace + @inbounds for red_linear in Base.OneTo(prod(red_sizes)) + red_cidx = CartesianIndices(red_sizes)[red_linear] + complete_cidx = CartesianIndex(ntuple(Val(N)) do d + @inbounds nred_cidx[d] + red_cidx[d] - 1 + end) + + val = f(ntuple(Val(length(inputs))) do m + @inbounds begin + a = inputs[m] + ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx) + a[ParentIndex(ip)] + end + end...) + + acc = _gpu_accum(op, acc, val) + end + + @inbounds out[ParentIndex(out_parent)] = acc +end + +function Strided._mapreduce_fuse!( + f, op, initop, + dims::Dims{N}, + arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView}} + ) where {TO, N} + + out = arrays[1] + inputs_raw = Base.tail(arrays) + M = length(inputs_raw) + inputs = ntuple(i -> inputs_raw[i], Val(M)) + + # Number of output elements = product of non-reduction dims + out_total = prod(ntuple(Val(N)) do d + @inbounds iszero(out.strides[d]) ? 1 : dims[d] + end) + + backend = KernelAbstractions.get_backend(parent(out)) + kernel! 
= _mapreduce_gpu_kernel!(backend) + kernel!(f, op, initop, dims, out, inputs; ndrange = out_total) + + return nothing +end + end From 96b7d09f2cd8e121f4796df0af62e336cad542c3 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Sun, 15 Mar 2026 08:49:02 -0400 Subject: [PATCH 02/17] Add GPU mapreduce tests using JLArrays Tests cover: pure map!, reduction over dim 1, reduction over dim 2, conj/adjoint StridedView, and full scalar reduction. JLArrays provides a CPU-backed GPU simulator so tests run without real GPU hardware. Co-Authored-By: Claude Sonnet 4.6 --- test/mapreduce_gpu.jl | 45 +++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 3 +++ 2 files changed, 48 insertions(+) create mode 100644 test/mapreduce_gpu.jl diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl new file mode 100644 index 0000000..3845d25 --- /dev/null +++ b/test/mapreduce_gpu.jl @@ -0,0 +1,45 @@ +using Test, Strided, StridedViews, JLArrays + +@testset "GPU map! via StridedView" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + A = JLArray(rand(T, 8, 6)) + B = similar(A) + map!(x -> 2x, StridedView(B), StridedView(A)) + @test Array(B) ≈ 2 .* Array(A) + end +end + +@testset "GPU mapreducedim! — sum over dim 1" begin + for T in (Float32, Float64) + A = JLArray(rand(T, 8, 6)) + B = JLArray(zeros(T, 1, 6)) + sum!(StridedView(B), StridedView(A)) + @test Array(B) ≈ sum(Array(A); dims = 1) + end +end + +@testset "GPU mapreducedim! — sum over dim 2" begin + for T in (Float32, Float64) + A = JLArray(rand(T, 8, 6)) + B = JLArray(zeros(T, 8, 1)) + sum!(StridedView(B), StridedView(A)) + @test Array(B) ≈ sum(Array(A); dims = 2) + end +end + +@testset "GPU map! 
with conj/adjoint StridedView" begin + for T in (ComplexF32, ComplexF64) + A = JLArray(rand(T, 4, 4)) + B = JLArray(zeros(T, 4, 4)) + copy!(adjoint(StridedView(B)), StridedView(A)) + @test Array(B) ≈ conj(Array(A)) + end +end + +@testset "GPU mapreduce — full scalar reduction" begin + for T in (Float32, Float64) + A = JLArray(rand(T, 8, 6)) + result = sum(StridedView(A)) + @test result ≈ sum(Array(A)) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 4108876..a26b337 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,6 +12,9 @@ is_buildkite = get(ENV, "BUILDKITE", "false") == "true" if !is_buildkite include("jlarrays.jl") + @testset "JLArray GPU mapreduce" begin + include("mapreduce_gpu.jl") + end println("Base.Threads.nthreads() = $(Base.Threads.nthreads())") println("Running tests single-threaded:") From 5e6900298e17f53e3d263aeffd345e8a8d2587e7 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Sun, 15 Mar 2026 09:04:38 -0400 Subject: [PATCH 03/17] Fix GPU mapreduce: scalar indexing, output type, test expectations Three fixes: - Add _mapreduce GPU override to avoid scalar indexing (first(A), out[ParentIndex(1)]) which JLArrays/real GPUs prohibit; uses zero(T) as proxy for type inference and similar(parent(A),...) to ensure the output stays on the GPU device - Fix adjoint test expectation: copy!(adjoint(B), A) gives B = adjoint(A), not conj(A) - Use qualified names Strided._init_reduction! and Strided._mapreducedim! 
since they are not exported into the extension module Co-Authored-By: Claude Sonnet 4.6 --- ext/StridedGPUArraysExt.jl | 27 +++++++++++++++++++++++++++ test/mapreduce_gpu.jl | 3 ++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 7c14ae7..5ae0644 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -106,6 +106,33 @@ end @inbounds out[ParentIndex(out_parent)] = acc end +# GPU-compatible _mapreduce: avoids scalar indexing (first(A), out[ParentIndex(1)]) +# that JLArrays/real GPUs prohibit. Uses zero(T) as a proxy to infer the output +# element type without reading from the device. +function Strided._mapreduce( + f, op, A::StridedView{T, N, <:AnyGPUArray{T}}, nt = nothing + ) where {T, N} + if length(A) == 0 + b = Base.mapreduce_empty(f, op, T) + return nt === nothing ? b : op(b, nt.init) + end + + dims = size(A) + + if nt === nothing + a_zero = Base.mapreduce_first(f, op, zero(T)) + out = similar(parent(A), typeof(a_zero), (1,)) + Strided._init_reduction!(out, f, op, a_zero) + else + out = similar(parent(A), typeof(nt.init), (1,)) + fill!(out, nt.init) + end + + Strided._mapreducedim!(f, op, nothing, dims, (sreshape(StridedView(out), one.(dims)), A)) + + return Array(out)[1] +end + function Strided._mapreduce_fuse!( f, op, initop, dims::Dims{N}, diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl index 3845d25..c959c4b 100644 --- a/test/mapreduce_gpu.jl +++ b/test/mapreduce_gpu.jl @@ -32,7 +32,7 @@ end A = JLArray(rand(T, 4, 4)) B = JLArray(zeros(T, 4, 4)) copy!(adjoint(StridedView(B)), StridedView(A)) - @test Array(B) ≈ conj(Array(A)) + @test Array(B) ≈ adjoint(Array(A)) end end @@ -40,6 +40,7 @@ end for T in (Float32, Float64) A = JLArray(rand(T, 8, 6)) result = sum(StridedView(A)) + @test result isa T @test result ≈ sum(Array(A)) end end From b21ea74a02772ee28b3510249ebf4bbdd20da445 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Sun, 15 Mar 2026 09:17:32 
-0400 Subject: [PATCH 04/17] Use GPUArrays.neutral_element for _mapreduce init value Replace zero(T) proxy with the same pattern GPUArrays uses: infer the output element type via Broadcast.combine_eltypes + Base.promote_op, then call GPUArrays.neutral_element(op, ET). Unknown operators now produce a clear error message rather than silently using zero(T). Also removes the dependency on the unexported Strided._init_reduction!. Co-Authored-By: Claude Sonnet 4.6 --- ext/StridedGPUArraysExt.jl | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 5ae0644..2023afd 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -107,8 +107,9 @@ end end # GPU-compatible _mapreduce: avoids scalar indexing (first(A), out[ParentIndex(1)]) -# that JLArrays/real GPUs prohibit. Uses zero(T) as a proxy to infer the output -# element type without reading from the device. +# that JLArrays/real GPUs prohibit. Mirrors GPUArrays' neutral_element approach: +# infer output type via Broadcast machinery, look up the neutral element (errors on +# unknown ops), fill the output buffer, then read back a single scalar via Array(). 
function Strided._mapreduce( f, op, A::StridedView{T, N, <:AnyGPUArray{T}}, nt = nothing ) where {T, N} @@ -120,14 +121,19 @@ function Strided._mapreduce( dims = size(A) if nt === nothing - a_zero = Base.mapreduce_first(f, op, zero(T)) - out = similar(parent(A), typeof(a_zero), (1,)) - Strided._init_reduction!(out, f, op, a_zero) + ET = Base.Broadcast.combine_eltypes(f, (A,)) + ET = Base.promote_op(op, ET, ET) + (ET === Union{} || ET === Any) && + error("cannot infer output element type for mapreduce; pass an explicit `init`") + init = GPUArrays.neutral_element(op, ET) else - out = similar(parent(A), typeof(nt.init), (1,)) - fill!(out, nt.init) + ET = typeof(nt.init) + init = nt.init end + out = similar(parent(A), ET, (1,)) + fill!(out, init) + Strided._mapreducedim!(f, op, nothing, dims, (sreshape(StridedView(out), one.(dims)), A)) return Array(out)[1] From 7b30822fd3df4e82986848e363b2cf0810af429b Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Sun, 15 Mar 2026 09:57:57 -0400 Subject: [PATCH 05/17] Extend GPU mapreduce tests with nontrivial strides and offsets Add 7 new testsets covering: - map! reading from stride-2 input (every other row) - map! writing into stride-2 output, checking untouched rows stay zero - map! on a subview with nonzero offset (2:6, 3:6 slice) - map! with permuted (transposed) strides via permutedims - sum over dim 1 with stride-2 input - sum over dim 2 with offset subview - full scalar reduction on stride-2 and offset subviews Co-Authored-By: Claude Sonnet 4.6 --- test/mapreduce_gpu.jl | 73 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl index c959c4b..99b1f90 100644 --- a/test/mapreduce_gpu.jl +++ b/test/mapreduce_gpu.jl @@ -44,3 +44,76 @@ end @test result ≈ sum(Array(A)) end end + +# ---- nontrivial strides and offsets ---- + +@testset "GPU map! 
— stride-2 input (every other row)" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + A = JLArray(rand(T, 8, 6)) + B = JLArray(zeros(T, 4, 6)) + src = StridedView(A)[1:2:8, :] # stride 2 in dim 1 + map!(x -> 2x, StridedView(B), src) + @test Array(B) ≈ 2 .* Array(A)[1:2:8, :] + end +end + +@testset "GPU map! — stride-2 output (every other row)" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + A = JLArray(rand(T, 4, 6)) + B = JLArray(zeros(T, 8, 6)) + dst = StridedView(B)[1:2:8, :] # stride 2 in dim 1 + map!(identity, dst, StridedView(A)) + @test Array(B)[1:2:8, :] ≈ Array(A) + @test all(iszero, Array(B)[2:2:8, :]) # untouched rows stay zero + end +end + +@testset "GPU map! — subview with nonzero offset" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + A = JLArray(rand(T, 8, 6)) + B = JLArray(zeros(T, 5, 4)) + src = StridedView(A)[2:6, 3:6] # offset = 1 row + 2 cols + map!(x -> x + 1, StridedView(B), src) + @test Array(B) ≈ Array(A)[2:6, 3:6] .+ 1 + end +end + +@testset "GPU map! 
— permuted (transposed) strides" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + A = JLArray(rand(T, 6, 8)) + B = JLArray(zeros(T, 8, 6)) + src = permutedims(StridedView(A), (2, 1)) # strides (8,1) → (1,6) after permute: 8×6 view + map!(identity, StridedView(B), src) + @test Array(B) ≈ permutedims(Array(A), (2, 1)) + end +end + +@testset "GPU sum over dim 1 — stride-2 input" begin + for T in (Float32, Float64) + A = JLArray(rand(T, 8, 6)) + B = JLArray(zeros(T, 1, 6)) + src = StridedView(A)[1:2:8, :] # 4×6 with stride 2 + sum!(StridedView(B), src) + @test Array(B) ≈ sum(Array(A)[1:2:8, :]; dims = 1) + end +end + +@testset "GPU sum over dim 2 — subview with offset" begin + for T in (Float32, Float64) + A = JLArray(rand(T, 8, 6)) + B = JLArray(zeros(T, 5, 1)) + src = StridedView(A)[2:6, 2:5] # 5×4, offset = 1 row + 1 col + sum!(StridedView(B), src) + @test Array(B) ≈ sum(Array(A)[2:6, 2:5]; dims = 2) + end +end + +@testset "GPU full scalar reduction — stride-2 and offset subview" begin + for T in (Float32, Float64) + A = JLArray(rand(T, 8, 6)) + r1 = sum(StridedView(A)[1:2:8, :]) # stride-2 + @test r1 ≈ sum(Array(A)[1:2:8, :]) + r2 = sum(StridedView(A)[3:6, 2:5]) # offset subview + @test r2 ≈ sum(Array(A)[3:6, 2:5]) + end +end From 4f66facf5bf60bd20ae0f8d5d2ae0a690ae8486b Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Sun, 15 Mar 2026 10:21:31 -0400 Subject: [PATCH 06/17] Restrict GPU _mapreduce_fuse! dispatch to all-GPU input arrays Change Vararg{StridedView} to Vararg{StridedView{<:Any, N, <:AnyGPUArray}} so the GPU kernel is only dispatched when every input (not just the output) is GPU-backed. Mixed CPU/GPU calls fall through to the CPU path. Add a test confirming the GPU path is bypassed for mixed inputs: the CPU fallback's scalar GPU indexing guard fires, proving the GPU kernel was not called. 
Co-Authored-By: Claude Sonnet 4.6 --- ext/StridedGPUArraysExt.jl | 2 +- test/mapreduce_gpu.jl | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 2023afd..63f6dd2 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -142,7 +142,7 @@ end function Strided._mapreduce_fuse!( f, op, initop, dims::Dims{N}, - arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView}} + arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView{<:Any, N, <:AnyGPUArray}}} ) where {TO, N} out = arrays[1] diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl index 99b1f90..aae6471 100644 --- a/test/mapreduce_gpu.jl +++ b/test/mapreduce_gpu.jl @@ -47,6 +47,16 @@ end # ---- nontrivial strides and offsets ---- +@testset "GPU dispatch requires all inputs on GPU" begin + # With a CPU input the GPU _mapreduce_fuse! must not be dispatched. + # The CPU fallback fires instead; since the output is GPU-backed it hits + # JLArrays' scalar-indexing guard — confirming the GPU path was bypassed. + A_gpu = JLArray(rand(Float32, 4, 4)) + A_cpu = Array(A_gpu) + B_gpu = JLArray(zeros(Float32, 4, 4)) + @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu)) +end + @testset "GPU map! — stride-2 input (every other row)" begin for T in (Float32, Float64, ComplexF32, ComplexF64) A = JLArray(rand(T, 8, 6)) From 32b66fb349e73ae4551d1f07e2a05d8ac3922fed Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Sun, 15 Mar 2026 10:54:27 -0400 Subject: [PATCH 07/17] Introduce GPUStridedView type alias to reduce verbosity Define const GPUStridedView{T,N} = StridedView{T, N, <:AnyGPUArray{T}} and use it throughout the extension in place of the long-form StridedView{T, N, <:AnyGPUArray{T}} / StridedView{<:Any, N, <:AnyGPUArray} annotations on get_backend, BroadcastStyle, __mul!, _mapreduce, and _mapreduce_fuse!. 
Co-Authored-By: Claude Sonnet 4.6 --- ext/StridedGPUArraysExt.jl | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 63f6dd2..a3f5a51 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -7,10 +7,13 @@ using StridedViews: ParentIndex ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} -KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv)) +# StridedView backed by any GPU array type, with element type linked to the parent. +const GPUStridedView{T, N} = StridedView{T, N, <:AnyGPUArray{T}} -function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} - raw_style = Base.Broadcast.BroadcastStyle(TA) +KernelAbstractions.get_backend(sv::GPUStridedView) = KernelAbstractions.get_backend(parent(sv)) + +function Base.Broadcast.BroadcastStyle(gpu_sv::GPUStridedView{T, N}) where {T, N} + raw_style = Base.Broadcast.BroadcastStyle(typeof(parent(gpu_sv))) return typeof(raw_style)(Val(N)) # sets the dimensionality correctly end @@ -36,9 +39,9 @@ function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractG end function Strided.__mul!( - C::StridedView{TC, 2, <:AnyGPUArray{TC}}, - A::StridedView{TA, 2, <:AnyGPUArray{TA}}, - B::StridedView{TB, 2, <:AnyGPUArray{TB}}, + C::GPUStridedView{TC, 2}, + A::GPUStridedView{TA, 2}, + B::GPUStridedView{TB, 2}, α::Number, β::Number ) where {TC, TA, TB} return GPUArrays.generic_matmatmul!(C, A, B, α, β) @@ -111,7 +114,7 @@ end # infer output type via Broadcast machinery, look up the neutral element (errors on # unknown ops), fill the output buffer, then read back a single scalar via Array(). 
function Strided._mapreduce( - f, op, A::StridedView{T, N, <:AnyGPUArray{T}}, nt = nothing + f, op, A::GPUStridedView{T, N}, nt = nothing ) where {T, N} if length(A) == 0 b = Base.mapreduce_empty(f, op, T) @@ -142,7 +145,7 @@ end function Strided._mapreduce_fuse!( f, op, initop, dims::Dims{N}, - arrays::Tuple{StridedView{TO, N, <:AnyGPUArray{TO}}, Vararg{StridedView{<:Any, N, <:AnyGPUArray}}} + arrays::Tuple{GPUStridedView{TO, N}, Vararg{GPUStridedView{<:Any, N}}} ) where {TO, N} out = arrays[1] From eec30de653e3ad2d61e7f1cc7008ad9b77dbb96b Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 11:57:31 -0400 Subject: [PATCH 08/17] Rework tests: parameterize mapreduce and other tests over Array and JLArray backends --- test/mapreduce_gpu.jl | 129 ----------------- test/mapreduce_tests.jl | 145 +++++++++++++++++++ test/othertests.jl | 310 +++++++++++++++++++++++----------------- test/runtests.jl | 4 +- 4 files changed, 325 insertions(+), 263 deletions(-) delete mode 100644 test/mapreduce_gpu.jl create mode 100644 test/mapreduce_tests.jl diff --git a/test/mapreduce_gpu.jl b/test/mapreduce_gpu.jl deleted file mode 100644 index aae6471..0000000 --- a/test/mapreduce_gpu.jl +++ /dev/null @@ -1,129 +0,0 @@ -using Test, Strided, StridedViews, JLArrays - -@testset "GPU map! via StridedView" begin - for T in (Float32, Float64, ComplexF32, ComplexF64) - A = JLArray(rand(T, 8, 6)) - B = similar(A) - map!(x -> 2x, StridedView(B), StridedView(A)) - @test Array(B) ≈ 2 .* Array(A) - end -end - -@testset "GPU mapreducedim! — sum over dim 1" begin - for T in (Float32, Float64) - A = JLArray(rand(T, 8, 6)) - B = JLArray(zeros(T, 1, 6)) - sum!(StridedView(B), StridedView(A)) - @test Array(B) ≈ sum(Array(A); dims = 1) - end -end - -@testset "GPU mapreducedim! — sum over dim 2" begin - for T in (Float32, Float64) - A = JLArray(rand(T, 8, 6)) - B = JLArray(zeros(T, 8, 1)) - sum!(StridedView(B), StridedView(A)) - @test Array(B) ≈ sum(Array(A); dims = 2) - end -end - -@testset "GPU map! 
with conj/adjoint StridedView" begin - for T in (ComplexF32, ComplexF64) - A = JLArray(rand(T, 4, 4)) - B = JLArray(zeros(T, 4, 4)) - copy!(adjoint(StridedView(B)), StridedView(A)) - @test Array(B) ≈ adjoint(Array(A)) - end -end - -@testset "GPU mapreduce — full scalar reduction" begin - for T in (Float32, Float64) - A = JLArray(rand(T, 8, 6)) - result = sum(StridedView(A)) - @test result isa T - @test result ≈ sum(Array(A)) - end -end - -# ---- nontrivial strides and offsets ---- - -@testset "GPU dispatch requires all inputs on GPU" begin - # With a CPU input the GPU _mapreduce_fuse! must not be dispatched. - # The CPU fallback fires instead; since the output is GPU-backed it hits - # JLArrays' scalar-indexing guard — confirming the GPU path was bypassed. - A_gpu = JLArray(rand(Float32, 4, 4)) - A_cpu = Array(A_gpu) - B_gpu = JLArray(zeros(Float32, 4, 4)) - @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu)) -end - -@testset "GPU map! — stride-2 input (every other row)" begin - for T in (Float32, Float64, ComplexF32, ComplexF64) - A = JLArray(rand(T, 8, 6)) - B = JLArray(zeros(T, 4, 6)) - src = StridedView(A)[1:2:8, :] # stride 2 in dim 1 - map!(x -> 2x, StridedView(B), src) - @test Array(B) ≈ 2 .* Array(A)[1:2:8, :] - end -end - -@testset "GPU map! — stride-2 output (every other row)" begin - for T in (Float32, Float64, ComplexF32, ComplexF64) - A = JLArray(rand(T, 4, 6)) - B = JLArray(zeros(T, 8, 6)) - dst = StridedView(B)[1:2:8, :] # stride 2 in dim 1 - map!(identity, dst, StridedView(A)) - @test Array(B)[1:2:8, :] ≈ Array(A) - @test all(iszero, Array(B)[2:2:8, :]) # untouched rows stay zero - end -end - -@testset "GPU map! 
— subview with nonzero offset" begin - for T in (Float32, Float64, ComplexF32, ComplexF64) - A = JLArray(rand(T, 8, 6)) - B = JLArray(zeros(T, 5, 4)) - src = StridedView(A)[2:6, 3:6] # offset = 1 row + 2 cols - map!(x -> x + 1, StridedView(B), src) - @test Array(B) ≈ Array(A)[2:6, 3:6] .+ 1 - end -end - -@testset "GPU map! — permuted (transposed) strides" begin - for T in (Float32, Float64, ComplexF32, ComplexF64) - A = JLArray(rand(T, 6, 8)) - B = JLArray(zeros(T, 8, 6)) - src = permutedims(StridedView(A), (2, 1)) # strides (8,1) → (1,6) after permute: 8×6 view - map!(identity, StridedView(B), src) - @test Array(B) ≈ permutedims(Array(A), (2, 1)) - end -end - -@testset "GPU sum over dim 1 — stride-2 input" begin - for T in (Float32, Float64) - A = JLArray(rand(T, 8, 6)) - B = JLArray(zeros(T, 1, 6)) - src = StridedView(A)[1:2:8, :] # 4×6 with stride 2 - sum!(StridedView(B), src) - @test Array(B) ≈ sum(Array(A)[1:2:8, :]; dims = 1) - end -end - -@testset "GPU sum over dim 2 — subview with offset" begin - for T in (Float32, Float64) - A = JLArray(rand(T, 8, 6)) - B = JLArray(zeros(T, 5, 1)) - src = StridedView(A)[2:6, 2:5] # 5×4, offset = 1 row + 1 col - sum!(StridedView(B), src) - @test Array(B) ≈ sum(Array(A)[2:6, 2:5]; dims = 2) - end -end - -@testset "GPU full scalar reduction — stride-2 and offset subview" begin - for T in (Float32, Float64) - A = JLArray(rand(T, 8, 6)) - r1 = sum(StridedView(A)[1:2:8, :]) # stride-2 - @test r1 ≈ sum(Array(A)[1:2:8, :]) - r2 = sum(StridedView(A)[3:6, 2:5]) # offset subview - @test r2 ≈ sum(Array(A)[3:6, 2:5]) - end -end diff --git a/test/mapreduce_tests.jl b/test/mapreduce_tests.jl new file mode 100644 index 0000000..46b3b34 --- /dev/null +++ b/test/mapreduce_tests.jl @@ -0,0 +1,145 @@ +# Parameterized mapreduce / map! tests. +# Iterates over both Array and JLArray backends internally. + +backends = [("Array", identity), ("JLArray", JLArray)] + +for (backend_name, make_arr) in backends + @testset "$backend_name: map! 
via StridedView" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + A = make_arr(rand(T, 8, 6)) + B = similar(A) + map!(x -> 2x, StridedView(B), StridedView(A)) + @test Array(StridedView(B)) ≈ 2 .* Array(StridedView(A)) + end + end + + @testset "$backend_name: mapreducedim! — sum over dim 1" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 1, 6)) + sum!(StridedView(B), StridedView(A)) + @test Array(StridedView(B)) ≈ sum(data; dims = 1) + end + end + + @testset "$backend_name: mapreducedim! — sum over dim 2" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 8, 1)) + sum!(StridedView(B), StridedView(A)) + @test Array(StridedView(B)) ≈ sum(data; dims = 2) + end + end + + @testset "$backend_name: map! with conj/adjoint StridedView" begin + for T in (ComplexF32, ComplexF64) + data = rand(T, 4, 4) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 4, 4)) + copy!(adjoint(StridedView(B)), StridedView(A)) + @test Array(StridedView(B)) ≈ adjoint(data) + end + end + + @testset "$backend_name: mapreduce — full scalar reduction" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + result = sum(StridedView(A)) + @test result isa T + @test result ≈ sum(data) + end + end + + # Only meaningful for GPU backends: mixing CPU and GPU inputs must not silently + # use the GPU dispatch path. + if make_arr !== identity + @testset "$backend_name: dispatch requires all inputs on GPU" begin + A_gpu = make_arr(rand(Float32, 4, 4)) + A_cpu = Array(StridedView(A_gpu)) + B_gpu = make_arr(zeros(Float32, 4, 4)) + @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu)) + end + end + + @testset "$backend_name: map! 
— stride-2 input (every other row)" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 4, 6)) + src = StridedView(A)[1:2:8, :] + map!(x -> 2x, StridedView(B), src) + @test Array(StridedView(B)) ≈ 2 .* data[1:2:8, :] + end + end + + @testset "$backend_name: map! — stride-2 output (every other row)" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 4, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 8, 6)) + dst = StridedView(B)[1:2:8, :] + map!(identity, dst, StridedView(A)) + B_cpu = Array(StridedView(B)) + @test B_cpu[1:2:8, :] ≈ data + @test all(iszero, B_cpu[2:2:8, :]) + end + end + + @testset "$backend_name: map! — subview with nonzero offset" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 5, 4)) + src = StridedView(A)[2:6, 3:6] + map!(x -> x + 1, StridedView(B), src) + @test Array(StridedView(B)) ≈ data[2:6, 3:6] .+ 1 + end + end + + @testset "$backend_name: map! 
— permuted (transposed) strides" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 6, 8) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 8, 6)) + src = permutedims(StridedView(A), (2, 1)) + map!(identity, StridedView(B), src) + @test Array(StridedView(B)) ≈ permutedims(data, (2, 1)) + end + end + + @testset "$backend_name: sum over dim 1 — stride-2 input" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 1, 6)) + src = StridedView(A)[1:2:8, :] + sum!(StridedView(B), src) + @test Array(StridedView(B)) ≈ sum(data[1:2:8, :]; dims = 1) + end + end + + @testset "$backend_name: sum over dim 2 — subview with offset" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 5, 1)) + src = StridedView(A)[2:6, 2:5] + sum!(StridedView(B), src) + @test Array(StridedView(B)) ≈ sum(data[2:6, 2:5]; dims = 2) + end + end + + @testset "$backend_name: full scalar reduction — stride-2 and offset subview" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + r1 = sum(StridedView(A)[1:2:8, :]) + @test r1 ≈ sum(data[1:2:8, :]) + r2 = sum(StridedView(A)[3:6, 2:5]) + @test r2 ≈ sum(data[3:6, 2:5]) + end + end +end diff --git a/test/othertests.jl b/test/othertests.jl index 8c775ce..223eb0f 100644 --- a/test/othertests.jl +++ b/test/othertests.jl @@ -1,157 +1,203 @@ +backends = [("Array", identity), ("JLArray", JLArray)] + @testset "in-place matrix operations" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - A1 = randn(T, (1000, 1000)) - A2 = similar(A1) - A1c = copy(A1) - A2c = copy(A2) - B1 = StridedView(A1c) - B2 = StridedView(A2c) - - @test conj!(A1) == conj!(B1) - @test adjoint!(A2, A1) == adjoint!(B2, B1) - @test transpose!(A2, A1) == transpose!(B2, B1) - @test permutedims!(A2, A1, (2, 1)) == permutedims!(B2, B1, (2, 1)) + for (backend_name, make_arr) in backends + @testset "$T 
($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + data1 = randn(T, (1000, 1000)) + data2 = randn(T, (1000, 1000)) + # CPU reference + A1 = copy(data1); A2 = copy(data2) + # Backend under test + B1 = StridedView(make_arr(copy(data1))) + B2 = StridedView(make_arr(copy(data2))) + + conj!(A1); conj!(B1) + @test A1 ≈ Array(B1) + adjoint!(A2, A1); adjoint!(B2, B1) + @test A2 ≈ Array(B2) + transpose!(A2, A1); transpose!(B2, B1) + @test A2 ≈ Array(B2) + permutedims!(A2, A1, (2, 1)); permutedims!(B2, B1, (2, 1)) + @test A2 ≈ Array(B2) + end end end @testset "map, scale!, axpy! and axpby! with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - @testset for N in 2:6 - dims = ntuple(n -> div(60, N), N) - R1, R2, R3 = rand(T, dims), rand(T, dims), rand(T, dims) - B1 = permutedims(StridedView(R1), randperm(N)) - B2 = permutedims(StridedView(R2), randperm(N)) - B3 = permutedims(StridedView(R3), randperm(N)) - A1 = convert(Array, B1) - A2 = convert(Array{T}, B2) # test different converts - A3 = convert(Array{T, N}, B3) - C1 = deepcopy(B1) - - @test rmul!(B1, 1 // 2) ≈ rmul!(A1, 1 // 2) - @test lmul!(1 // 3, B2) ≈ lmul!(1 // 3, A2) - @test axpy!(1 // 3, B1, B2) ≈ axpy!(1 // 3, A1, A2) - @test axpy!(1, B2, B3) ≈ axpy!(1, A2, A3) - @test axpby!(1 // 3, B1, 1 // 2, B3) ≈ axpby!(1 // 3, A1, 1 // 2, A3) - @test axpby!(1, B2, 1, B1) ≈ axpby!(1, A2, 1, A1) - @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) ≈ - map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3) - @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView - @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array - @test mul!(B1, 1, B2) ≈ mul!(A1, 1, A2) - @test mul!(B1, B2, 1) ≈ mul!(A1, A2, 1) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + @testset for N in 2:6 + dims = ntuple(n -> div(60, N), N) + perm1, perm2, perm3 = randperm(N), 
randperm(N), randperm(N) + R1_cpu, R2_cpu, R3_cpu = rand(T, dims), rand(T, dims), rand(T, dims) + R1 = make_arr(copy(R1_cpu)) + R2 = make_arr(copy(R2_cpu)) + R3 = make_arr(copy(R3_cpu)) + B1 = permutedims(StridedView(R1), perm1) + B2 = permutedims(StridedView(R2), perm2) + B3 = permutedims(StridedView(R3), perm3) + A1 = Array(B1) + A2 = Array(B2) + A3 = Array(B3) + + @test Array(rmul!(B1, 1 // 2)) ≈ rmul!(A1, 1 // 2) + @test Array(lmul!(1 // 3, B2)) ≈ lmul!(1 // 3, A2) + @test Array(axpy!(1 // 3, B1, B2)) ≈ axpy!(1 // 3, A1, A2) + @test Array(axpy!(1, B2, B3)) ≈ axpy!(1, A2, A3) + @test Array(axpby!(1 // 3, B1, 1 // 2, B3)) ≈ axpby!(1 // 3, A1, 1 // 2, A3) + @test Array(axpby!(1, B2, 1, B1)) ≈ axpby!(1, A2, 1, A1) + @test Array(map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3)) ≈ + map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3) + @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView + if make_arr === identity + @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array + end + @test Array(mul!(B1, 1, B2)) ≈ mul!(A1, 1, A2) + @test Array(mul!(B1, B2, 1)) ≈ mul!(A1, A2, 1) + end end end end @testset "broadcast with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - R1, R2, R3 = rand(T, (10,)), rand(T, (10, 10)), rand(T, (10, 10, 10)) - B1 = StridedView(R1) - B2 = permutedims(StridedView(R2), randperm(2)) - B3 = permutedims(StridedView(R3), randperm(3)) - A1 = convert(Array, B1) - A2 = convert(Array{T}, B2) - A3 = convert(Array{T, 3}, B3) - - @test @inferred(B1 .+ sin.(B2 .- 3)) ≈ A1 .+ sin.(A2 .- 3) - @test @inferred(B2' .* B3 .- Ref(0.5)) ≈ A2' .* A3 .- Ref(0.5) - @test @inferred(B2' .* B3 .- max.(abs.(B1), real.(B3))) ≈ - A2' .* A3 .- max.(abs.(A1), real.(A3)) - - @test (B1 .+ sin.(B2 .- 3)) isa StridedView - @test (B2' .* B3 .- Ref(0.5)) isa StridedView - @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView - @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array + for 
(backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + R1_cpu = rand(T, (10,)) + R2_cpu = rand(T, (10, 10)) + R3_cpu = rand(T, (10, 10, 10)) + perm2, perm3 = randperm(2), randperm(3) + R1 = make_arr(copy(R1_cpu)) + R2 = make_arr(copy(R2_cpu)) + R3 = make_arr(copy(R3_cpu)) + B1 = StridedView(R1) + B2 = permutedims(StridedView(R2), perm2) + B3 = permutedims(StridedView(R3), perm3) + A1 = Array(B1) + A2 = Array(B2) + A3 = Array(B3) + + @test Array(@inferred(B1 .+ sin.(B2 .- 3))) ≈ A1 .+ sin.(A2 .- 3) + @test Array(@inferred(B2' .* B3 .- Ref(0.5))) ≈ A2' .* A3 .- Ref(0.5) + @test Array(@inferred(B2' .* B3 .- max.(abs.(B1), real.(B3)))) ≈ + A2' .* A3 .- max.(abs.(A1), real.(A3)) + + @test (B1 .+ sin.(B2 .- 3)) isa StridedView + @test (B2' .* B3 .- Ref(0.5)) isa StridedView + @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView + if make_arr === identity + @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array + end + end end end @testset "broadcast with zero-length StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - A1 = StridedView(zeros(T, (2, 0))) - A2 = StridedView(zeros(T, (2, 0))) - @test (A1 .+ A2) == StridedView(zeros(T, (2, 0))) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + A1 = StridedView(make_arr(zeros(T, (2, 0)))) + A2 = StridedView(make_arr(zeros(T, (2, 0)))) + @test Array(A1 .+ A2) == zeros(T, (2, 0)) + end end end @testset "mapreduce with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - R1 = rand(T, (10, 10, 10, 10, 10, 10)) - @test sum(R1; dims = (1, 3, 5)) ≈ sum(StridedView(R1); dims = (1, 3, 5)) - @test mapreduce(sin, +, R1; dims = (1, 3, 5)) ≈ - mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5)) - R2 = rand(T, (10, 10, 10)) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, identity, (10, 10, 10, 10, 10, 10), - ( - 
sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ reshape(R2, (10, 1, 1, 10, 10, 1)) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, x -> 0, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) - R2c = copy(R2) - β = rand(T) - @test Strided._mapreducedim!( - sin, +, x -> β * x, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ - β .* reshape(R2, (10, 1, 1, 10, 10, 1)) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, x -> β, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6), init = β) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, conj, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ - conj.(reshape(R2, (10, 1, 1, 10, 10, 1))) - - R3 = rand(T, (100, 100, 2)) - @test sum(R3; dims = (1, 2)) ≈ sum(StridedView(R3); dims = (1, 2)) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + R1_cpu = rand(T, (10, 10, 10, 10, 10, 10)) + R2_cpu = rand(T, (10, 10, 10)) + R1 = make_arr(copy(R1_cpu)) + R2 = make_arr(copy(R2_cpu)) + + @test sum(StridedView(R1); dims = (1, 3, 5)) isa StridedView + @test Array(sum(StridedView(R1); dims = (1, 3, 5))) ≈ sum(R1_cpu; dims = (1, 3, 5)) + @test Array(mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5))) ≈ + mapreduce(sin, +, R1_cpu; dims = (1, 3, 5)) + + R2c = copy(R2) + @test Array(Strided._mapreducedim!( + sin, +, identity, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + )) ≈ + 
mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ reshape(R2_cpu, (10, 1, 1, 10, 10, 1)) + + R2c = copy(R2) + @test Array(Strided._mapreducedim!( + sin, +, x -> 0, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + )) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) + + R2c = copy(R2) + β = rand(T) + @test Array(Strided._mapreducedim!( + sin, +, x -> β * x, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + )) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ + β .* reshape(R2_cpu, (10, 1, 1, 10, 10, 1)) + + R2c = copy(R2) + @test Array(Strided._mapreducedim!( + sin, +, x -> β, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + )) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6), init = β) + + R2c = copy(R2) + @test Array(Strided._mapreducedim!( + sin, +, conj, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + )) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ + conj.(reshape(R2_cpu, (10, 1, 1, 10, 10, 1))) + + R3_cpu = rand(T, (100, 100, 2)) + R3 = make_arr(copy(R3_cpu)) + @test Array(sum(StridedView(R3); dims = (1, 2))) ≈ sum(R3_cpu; dims = (1, 2)) + end end end @testset "complete reductions with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - R1 = rand(T, (10, 10, 10, 10, 10, 10)) - - @test sum(R1) ≈ sum(StridedView(R1)) - @test maximum(abs, R1) ≈ maximum(abs, StridedView(R1)) - @test minimum(real, R1) ≈ minimum(real, StridedView(R1)) - @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R1)) - - R2 = PermutedDimsArray(R1, (randperm(6)...,)) - - @test sum(R2) ≈ sum(StridedView(R2)) - @test maximum(abs, R2) ≈ maximum(abs, StridedView(R2)) - @test minimum(real, R2) ≈ minimum(real, StridedView(R2)) - @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R2)) - - R3 = rand(T, 
(5, 5, 5)) - @test prod(exp, StridedView(R3)) ≈ exp(sum(StridedView(R3))) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + R1_cpu = rand(T, (10, 10, 10, 10, 10, 10)) + R1 = make_arr(copy(R1_cpu)) + + @test sum(StridedView(R1)) ≈ sum(R1_cpu) + @test maximum(abs, StridedView(R1)) ≈ maximum(abs, R1_cpu) + @test minimum(real, StridedView(R1)) ≈ minimum(real, R1_cpu) + @test sum(x -> real(x) < 0, StridedView(R1)) == sum(x -> real(x) < 0, R1_cpu) + + perm = (randperm(6)...,) + R2_cpu = PermutedDimsArray(R1_cpu, perm) + R2 = PermutedDimsArray(R1, perm) + + @test sum(StridedView(R2)) ≈ sum(R2_cpu) + @test maximum(abs, StridedView(R2)) ≈ maximum(abs, R2_cpu) + @test minimum(real, StridedView(R2)) ≈ minimum(real, R2_cpu) + @test sum(x -> real(x) < 0, StridedView(R2)) == sum(x -> real(x) < 0, R1_cpu) + + R3_cpu = rand(T, (5, 5, 5)) + R3 = make_arr(copy(R3_cpu)) + @test prod(exp, StridedView(R3)) ≈ exp(sum(StridedView(R3))) + end end end diff --git a/test/runtests.jl b/test/runtests.jl index a26b337..3607bee 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,8 +12,8 @@ is_buildkite = get(ENV, "BUILDKITE", "false") == "true" if !is_buildkite include("jlarrays.jl") - @testset "JLArray GPU mapreduce" begin - include("mapreduce_gpu.jl") + @testset "mapreduce tests" begin + include("mapreduce_tests.jl") end println("Base.Threads.nthreads() = $(Base.Threads.nthreads())") From 936c38e46624feb0622d51a0e69340044a613c35 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 11:57:57 -0400 Subject: [PATCH 09/17] hijack some more linearalgebra methods --- src/linalg.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/linalg.jl b/src/linalg.jl index 5b054ca..134db79 100644 --- a/src/linalg.jl +++ b/src/linalg.jl @@ -2,6 +2,13 @@ LinearAlgebra.rmul!(dst::StridedView, α::Number) = mul!(dst, dst, α) LinearAlgebra.lmul!(α::Number, dst::StridedView) = mul!(dst, α, dst) 
+LinearAlgebra.adjoint!(C::StridedView, A::StridedView) = copy!(C, adjoint(A)) +LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A)) +function Base.permutedims!(C::StridedView{T, N}, A::StridedView{T, N}, perm) where {T, N} + copy!(C, permutedims(A, perm)) + return C +end + function LinearAlgebra.mul!( dst::StridedView{<:Number, N}, α::Number, src::StridedView{<:Number, N} From c935288174437bef4e336e8302a63eb975c2ab26 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 11:58:06 -0400 Subject: [PATCH 10/17] bypass something --- ext/StridedGPUArraysExt.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index a3f5a51..a2e4c34 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -24,6 +24,14 @@ function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS return dst end +# Conversion to CPU Array: materialise into a contiguous GPU array first (so the +# GPU-to-GPU copy! path is used), then let the GPU array type handle the transfer. 
+function Base.Array(a::GPUStridedView{T, N}) where {T, N} + b = similar(parent(a), T, size(a)) + copy!(StridedView(b), a) + return Array(b) +end + # lifted from GPUArrays.jl function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS} isempty(A) && return A From 42bb19b2afdc911950aea435a58befbe982c8fc2 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 13:17:03 -0400 Subject: [PATCH 11/17] correctly allocate output type --- src/broadcast.jl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/broadcast.jl b/src/broadcast.jl index b480a70..bc816b8 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -20,8 +20,21 @@ function Broadcast.BroadcastStyle( end function Base.similar(bc::Broadcasted{<:StridedArrayStyle{N}}, ::Type{T}) where {N, T} - return StridedView(similar(convert(Broadcasted{DefaultArrayStyle{N}}, bc), T)) + sv = _find_strided_view(bc) + if sv !== nothing + return StridedView(similar(parent(sv), T, size(bc))) + end + return StridedView(similar(Array{T}, axes(bc))) +end + +@inline _find_strided_view(bc::Broadcasted) = _find_strided_view(bc.args...) +@inline _find_strided_view(sv::StridedView, rest...) = sv +@inline function _find_strided_view(nested::Broadcasted, rest...) + sv = _find_strided_view(nested) + sv === nothing ? _find_strided_view(rest...) : sv end +@inline _find_strided_view(x, rest...) = _find_strided_view(rest...) +@inline _find_strided_view() = nothing Base.dotview(a::StridedView{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} = getindex(a, I...) 
From 5f991eeee72dca4d30f78543f0a3aeedfca77146 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 13:17:32 -0400 Subject: [PATCH 12/17] formatter --- ext/StridedGPUArraysExt.jl | 32 ++++++++++------- src/broadcast.jl | 2 +- test/othertests.jl | 70 ++++++++++++++++++++++---------------- 3 files changed, 60 insertions(+), 44 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index a2e4c34..2e7a8d3 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -99,17 +99,21 @@ end # Sequential reduction loop over reduction subspace @inbounds for red_linear in Base.OneTo(prod(red_sizes)) red_cidx = CartesianIndices(red_sizes)[red_linear] - complete_cidx = CartesianIndex(ntuple(Val(N)) do d - @inbounds nred_cidx[d] + red_cidx[d] - 1 - end) - - val = f(ntuple(Val(length(inputs))) do m - @inbounds begin - a = inputs[m] - ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx) - a[ParentIndex(ip)] + complete_cidx = CartesianIndex( + ntuple(Val(N)) do d + @inbounds nred_cidx[d] + red_cidx[d] - 1 end - end...) + ) + + val = f( + ntuple(Val(length(inputs))) do m + @inbounds begin + a = inputs[m] + ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx) + a[ParentIndex(ip)] + end + end... + ) acc = _gpu_accum(op, acc, val) end @@ -162,9 +166,11 @@ function Strided._mapreduce_fuse!( inputs = ntuple(i -> inputs_raw[i], Val(M)) # Number of output elements = product of non-reduction dims - out_total = prod(ntuple(Val(N)) do d - @inbounds iszero(out.strides[d]) ? 1 : dims[d] - end) + out_total = prod( + ntuple(Val(N)) do d + @inbounds iszero(out.strides[d]) ? 1 : dims[d] + end + ) backend = KernelAbstractions.get_backend(parent(out)) kernel! = _mapreduce_gpu_kernel!(backend) diff --git a/src/broadcast.jl b/src/broadcast.jl index bc816b8..b3151c9 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -31,7 +31,7 @@ end @inline _find_strided_view(sv::StridedView, rest...) 
= sv @inline function _find_strided_view(nested::Broadcasted, rest...) sv = _find_strided_view(nested) - sv === nothing ? _find_strided_view(rest...) : sv + return sv === nothing ? _find_strided_view(rest...) : sv end @inline _find_strided_view(x, rest...) = _find_strided_view(rest...) @inline _find_strided_view() = nothing diff --git a/test/othertests.jl b/test/othertests.jl index 223eb0f..ea5fee1 100644 --- a/test/othertests.jl +++ b/test/othertests.jl @@ -115,55 +115,65 @@ end mapreduce(sin, +, R1_cpu; dims = (1, 3, 5)) R2c = copy(R2) - @test Array(Strided._mapreducedim!( - sin, +, identity, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), + @test Array( + Strided._mapreducedim!( + sin, +, identity, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) ) - )) ≈ + ) ≈ mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ reshape(R2_cpu, (10, 1, 1, 10, 10, 1)) R2c = copy(R2) - @test Array(Strided._mapreducedim!( - sin, +, x -> 0, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), + @test Array( + Strided._mapreducedim!( + sin, +, x -> 0, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) ) - )) ≈ + ) ≈ mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) R2c = copy(R2) β = rand(T) - @test Array(Strided._mapreducedim!( - sin, +, x -> β * x, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), + @test Array( + Strided._mapreducedim!( + sin, +, x -> β * x, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) ) - )) ≈ + ) ≈ mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ β .* reshape(R2_cpu, (10, 1, 1, 10, 10, 1)) R2c = copy(R2) - @test Array(Strided._mapreducedim!( - sin, +, x -> β, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - 
StridedView(R1), + @test Array( + Strided._mapreducedim!( + sin, +, x -> β, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) ) - )) ≈ + ) ≈ mapreduce(sin, +, R1_cpu; dims = (2, 3, 6), init = β) R2c = copy(R2) - @test Array(Strided._mapreducedim!( - sin, +, conj, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), + @test Array( + Strided._mapreducedim!( + sin, +, conj, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) ) - )) ≈ + ) ≈ mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ conj.(reshape(R2_cpu, (10, 1, 1, 10, 10, 1))) From 87a801ceba2390abab860946d9f4778639e6905e Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 14:21:56 -0400 Subject: [PATCH 13/17] remove duplicate definitions --- src/linalg.jl | 7 ------- src/mapreduce.jl | 15 +++------------ 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/linalg.jl b/src/linalg.jl index 134db79..5b054ca 100644 --- a/src/linalg.jl +++ b/src/linalg.jl @@ -2,13 +2,6 @@ LinearAlgebra.rmul!(dst::StridedView, α::Number) = mul!(dst, dst, α) LinearAlgebra.lmul!(α::Number, dst::StridedView) = mul!(dst, α, dst) -LinearAlgebra.adjoint!(C::StridedView, A::StridedView) = copy!(C, adjoint(A)) -LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A)) -function Base.permutedims!(C::StridedView{T, N}, A::StridedView{T, N}, perm) where {T, N} - copy!(C, permutedims(A, perm)) - return C -end - function LinearAlgebra.mul!( dst::StridedView{<:Number, N}, α::Number, src::StridedView{<:Number, N} diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 8bfd7d9..a7d2f55 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -4,18 +4,9 @@ function Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) wher end Base.conj!(a::StridedView{<:Real}) = a Base.conj!(a::StridedView) = map!(conj, a, a) -function 
LinearAlgebra.adjoint!( - dst::StridedView{<:Any, N}, - src::StridedView{<:Any, N} - ) where {N} - return copy!(dst, adjoint(src)) -end -function Base.permutedims!( - dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}, - p - ) where {N} - return copy!(dst, permutedims(src, p)) -end +LinearAlgebra.adjoint!(dst::StridedView, src::StridedView) = copy!(dst, adjoint(src)) +LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A)) +Base.permutedims!(dst::StridedView, src::StridedView, p) = copy!(dst, permutedims(src, p)) function Base.mapreduce(f, op, A::StridedView; dims = :, kw...) return Base._mapreduce_dim(f, op, values(kw), A, dims) From 971056cf603a47cf4152beab50577bcecb3b7b73 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 14:25:43 -0400 Subject: [PATCH 14/17] add Metal to test deps --- Project.toml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index f6bf082..8774c8b 100644 --- a/Project.toml +++ b/Project.toml @@ -10,23 +10,24 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" [extensions] StridedAMDGPUExt = "AMDGPU" -StridedJLArraysExt = "JLArrays" -StridedGPUArraysExt = "GPUArrays" StridedCUDAExt = "CUDA" +StridedGPUArraysExt = "GPUArrays" +StridedJLArraysExt = "JLArrays" [compat] AMDGPU = "2" Aqua = "0.8" CUDA = "5" -JLArrays = "0.3.1" GPUArrays = "11.4.1" +JLArrays = "0.3.1" LinearAlgebra = "1.6" +Metal = "1.9" Random = "1.6" StridedViews = "0.4.6" Test = "1.6" @@ -37,10 +38,11 @@ julia = "1.6" AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -JLArrays 
= "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays"] +test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays", "Metal"] From 3872de423872755edb44a5943dcf1800a8863a27 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 16:35:11 -0400 Subject: [PATCH 15/17] remove fill! specialization --- ext/StridedGPUArraysExt.jl | 14 -------------- src/mapreduce.jl | 5 ++--- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 2e7a8d3..149c476 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -32,20 +32,6 @@ function Base.Array(a::GPUStridedView{T, N}) where {T, N} return Array(b) end -# lifted from GPUArrays.jl -function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS} - isempty(A) && return A - @kernel function fill_kernel!(a, val) - idx = @index(Global, Cartesian) - @inbounds a[idx] = val - end - # ndims check for 0D support - kernel = fill_kernel!(KernelAbstractions.get_backend(A)) - f_x = F <: Union{typeof(conj), typeof(adjoint)} ? conj(x) : x - kernel(A, f_x; ndrange = size(A)) - return A -end - function Strided.__mul!( C::GPUStridedView{TC, 2}, A::GPUStridedView{TA, 2}, diff --git a/src/mapreduce.jl b/src/mapreduce.jl index a7d2f55..c60aa5e 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -1,12 +1,11 @@ # Methods based on map! 
-function Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N} - return map!(identity, dst, src) -end +Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N} = map!(identity, dst, src) Base.conj!(a::StridedView{<:Real}) = a Base.conj!(a::StridedView) = map!(conj, a, a) LinearAlgebra.adjoint!(dst::StridedView, src::StridedView) = copy!(dst, adjoint(src)) LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A)) Base.permutedims!(dst::StridedView, src::StridedView, p) = copy!(dst, permutedims(src, p)) +Base.fill!(A::StridedView, val) = map!(Returns(val), A) function Base.mapreduce(f, op, A::StridedView; dims = :, kw...) return Base._mapreduce_dim(f, op, values(kw), A, dims) From 90521ee9ace9fb3900b64375d257f8fe134a12ce Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 16:36:12 -0400 Subject: [PATCH 16/17] remove some GPU specializations --- ext/StridedGPUArraysExt.jl | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 149c476..e4f58a1 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -12,22 +12,10 @@ const GPUStridedView{T, N} = StridedView{T, N, <:AnyGPUArray{T}} KernelAbstractions.get_backend(sv::GPUStridedView) = KernelAbstractions.get_backend(parent(sv)) -function Base.Broadcast.BroadcastStyle(gpu_sv::GPUStridedView{T, N}) where {T, N} - raw_style = Base.Broadcast.BroadcastStyle(typeof(parent(gpu_sv))) - return typeof(raw_style)(Val(N)) # sets the dimensionality correctly -end - -function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TS <: Number, NS, TAS <: AbstractGPUArray{TS}, FS <: ALL_FS} - bc_style = Base.Broadcast.BroadcastStyle(TAS) - bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst)) - GPUArrays._copyto!(dst, bc) - return dst -end - # Conversion to CPU Array: materialise into a 
contiguous GPU array first (so the # GPU-to-GPU copy! path is used), then let the GPU array type handle the transfer. -function Base.Array(a::GPUStridedView{T, N}) where {T, N} - b = similar(parent(a), T, size(a)) +function Base.Array(a::GPUStridedView) + b = similar(parent(a), eltype(a), size(a)) copy!(StridedView(b), a) return Array(b) end From 4f2f364a08516e35c2fc2bccb8cdee23dac9f76f Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 16 Mar 2026 16:36:17 -0400 Subject: [PATCH 17/17] cleanup tests --- test/jlarrays.jl | 19 ------------------- test/runtests.jl | 1 - 2 files changed, 20 deletions(-) delete mode 100644 test/jlarrays.jl diff --git a/test/jlarrays.jl b/test/jlarrays.jl deleted file mode 100644 index 5aceb35..0000000 --- a/test/jlarrays.jl +++ /dev/null @@ -1,19 +0,0 @@ -@testset for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) - @testset "Copy with JLArrayStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint) - for m1 in (0, 16, 32), m2 in (0, 16, 32) - A1 = JLArray(randn(T, (m1, m2))) - A2 = similar(A1) - zA1 = JLArray(f1(zeros(T, (m1, m2)))) - zA2 = JLArray(f2(zeros(T, (m1, m2)))) - A1c = copy(A1) - A2c = copy(A2) - B1 = f1(StridedView(A1c)) - B2 = f2(StridedView(A2c)) - axes(f1(A1)) == axes(f2(A2)) || continue - @test collect(Matrix(copy!(f2(A2), f1(A1)))) == JLArrays.Adapt.adapt(Vector{T}, copy!(B2, B1)) - @test copy!(zA1, f1(A1)) == copy!(zA2, B1) - x = rand(T) - @test f1(StridedView(JLArrays.Adapt.adapt(Vector{T}, fill!(A1c, x)))) == JLArrays.Adapt.adapt(Vector{T}, fill!(B1, x)) - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 3607bee..80dd88b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,7 +11,6 @@ Random.seed!(1234) is_buildkite = get(ENV, "BUILDKITE", "false") == "true" if !is_buildkite - include("jlarrays.jl") @testset "mapreduce tests" begin include("mapreduce_tests.jl") end