diff --git a/Project.toml b/Project.toml index f6bf082..8774c8b 100644 --- a/Project.toml +++ b/Project.toml @@ -10,23 +10,24 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" [extensions] StridedAMDGPUExt = "AMDGPU" -StridedJLArraysExt = "JLArrays" -StridedGPUArraysExt = "GPUArrays" StridedCUDAExt = "CUDA" +StridedGPUArraysExt = "GPUArrays" +StridedJLArraysExt = "JLArrays" [compat] AMDGPU = "2" Aqua = "0.8" CUDA = "5" -JLArrays = "0.3.1" GPUArrays = "11.4.1" +JLArrays = "0.3.1" LinearAlgebra = "1.6" +Metal = "1.9" Random = "1.6" StridedViews = "0.4.6" Test = "1.6" @@ -37,10 +38,11 @@ julia = "1.6" AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays"] +test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays", "JLArrays", "Metal"] diff --git a/ext/StridedGPUArraysExt.jl b/ext/StridedGPUArraysExt.jl index 608d8b5..e4f58a1 100644 --- a/ext/StridedGPUArraysExt.jl +++ b/ext/StridedGPUArraysExt.jl @@ -3,44 +3,154 @@ module StridedGPUArraysExt using Strided, GPUArrays using GPUArrays: Adapt, KernelAbstractions using GPUArrays.KernelAbstractions: @kernel, @index +using StridedViews: ParentIndex ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)} -KernelAbstractions.get_backend(sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} = KernelAbstractions.get_backend(parent(sv)) +# StridedView backed by any GPU array type, with element type linked to the parent. +const GPUStridedView{T, N} = StridedView{T, N, <:AnyGPUArray{T}} -function Base.Broadcast.BroadcastStyle(gpu_sv::StridedView{T, N, TA}) where {T, N, TA <: AnyGPUArray{T}} - raw_style = Base.Broadcast.BroadcastStyle(TA) - return typeof(raw_style)(Val(N)) # sets the dimensionality correctly -end +KernelAbstractions.get_backend(sv::GPUStridedView) = KernelAbstractions.get_backend(parent(sv)) -function Base.copy!(dst::AbstractArray{TD, ND}, src::StridedView{TS, NS, TAS, FS}) where {TD <: Number, ND, TS <: Number, NS, TAS <: AbstractGPUArray{TS}, FS <: ALL_FS} - bc_style = Base.Broadcast.BroadcastStyle(TAS) - bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst)) - GPUArrays._copyto!(dst, bc) - return dst -end - -# lifted from GPUArrays.jl -function Base.fill!(A::StridedView{T, N, TA, F}, x) where {T, N, TA <: AbstractGPUArray{T}, F <: ALL_FS} - isempty(A) && return A - @kernel function fill_kernel!(a, val) - idx = @index(Global, Cartesian) - @inbounds a[idx] = val - end - # ndims check for 0D support - kernel = fill_kernel!(KernelAbstractions.get_backend(A)) - f_x = F <: Union{typeof(conj), typeof(adjoint)} ? conj(x) : x - kernel(A, f_x; ndrange = size(A)) - return A +# Conversion to CPU Array: materialise into a contiguous GPU array first (so the +# GPU-to-GPU copy! path is used), then let the GPU array type handle the transfer. +function Base.Array(a::GPUStridedView) + b = similar(parent(a), eltype(a), size(a)) + copy!(StridedView(b), a) + return Array(b) end function Strided.__mul!( - C::StridedView{TC, 2, <:AnyGPUArray{TC}}, - A::StridedView{TA, 2, <:AnyGPUArray{TA}}, - B::StridedView{TB, 2, <:AnyGPUArray{TB}}, + C::GPUStridedView{TC, 2}, + A::GPUStridedView{TA, 2}, + B::GPUStridedView{TB, 2}, α::Number, β::Number ) where {TC, TA, TB} return GPUArrays.generic_matmatmul!(C, A, B, α, β) end +# ---------- GPU mapreduce support ---------- + +@inline _gpu_init_acc(::Nothing, current_val) = current_val +@inline _gpu_init_acc(initop, current_val) = initop(current_val) + +@inline _gpu_accum(::Nothing, acc, val) = val +@inline _gpu_accum(op, acc, val) = op(acc, val) + +@inline function _strides_dot(strides::NTuple{N, Int}, cidx::CartesianIndex{N}) where {N} + s = 0 + for d in Base.OneTo(N) + @inbounds s += strides[d] * (cidx[d] - 1) + end + return s +end + +@kernel function _mapreduce_gpu_kernel!( + f, op, initop, + dims::NTuple{N, Int}, + out::OT, + inputs::IT + ) where {N, OT <: StridedView, IT <: Tuple} + + out_linear = @index(Global, Linear) + + # Non-reduction subspace sizes (1 for reduction dims) + nred_sizes = ntuple(Val(N)) do d + @inbounds iszero(out.strides[d]) ? 1 : dims[d] + end + # Reduction subspace sizes (1 for non-reduction dims) + red_sizes = ntuple(Val(N)) do d + @inbounds iszero(out.strides[d]) ? dims[d] : 1 + end + + # Map out_linear → cartesian in non-reduction subspace + nred_cidx = CartesianIndices(nred_sizes)[out_linear] + out_parent = out.offset + 1 + _strides_dot(out.strides, nred_cidx) + + # Initialize accumulator from current output value (or apply initop) + @inbounds acc = _gpu_init_acc(initop, out[ParentIndex(out_parent)]) + + # Sequential reduction loop over reduction subspace + @inbounds for red_linear in Base.OneTo(prod(red_sizes)) + red_cidx = CartesianIndices(red_sizes)[red_linear] + complete_cidx = CartesianIndex( + ntuple(Val(N)) do d + @inbounds nred_cidx[d] + red_cidx[d] - 1 + end + ) + + val = f( + ntuple(Val(length(inputs))) do m + @inbounds begin + a = inputs[m] + ip = a.offset + 1 + _strides_dot(a.strides, complete_cidx) + a[ParentIndex(ip)] + end + end... + ) + + acc = _gpu_accum(op, acc, val) + end + + @inbounds out[ParentIndex(out_parent)] = acc +end + +# GPU-compatible _mapreduce: avoids scalar indexing (first(A), out[ParentIndex(1)]) +# that JLArrays/real GPUs prohibit. Mirrors GPUArrays' neutral_element approach: +# infer output type via Broadcast machinery, look up the neutral element (errors on +# unknown ops), fill the output buffer, then read back a single scalar via Array(). +function Strided._mapreduce( + f, op, A::GPUStridedView{T, N}, nt = nothing + ) where {T, N} + if length(A) == 0 + b = Base.mapreduce_empty(f, op, T) + return nt === nothing ? b : op(b, nt.init) + end + + dims = size(A) + + if nt === nothing + ET = Base.Broadcast.combine_eltypes(f, (A,)) + ET = Base.promote_op(op, ET, ET) + (ET === Union{} || ET === Any) && + error("cannot infer output element type for mapreduce; pass an explicit `init`") + init = GPUArrays.neutral_element(op, ET) + else + ET = typeof(nt.init) + init = nt.init + end + + out = similar(parent(A), ET, (1,)) + fill!(out, init) + + Strided._mapreducedim!(f, op, nothing, dims, (sreshape(StridedView(out), one.(dims)), A)) + + return Array(out)[1] +end + +function Strided._mapreduce_fuse!( + f, op, initop, + dims::Dims{N}, + arrays::Tuple{GPUStridedView{TO, N}, Vararg{GPUStridedView{<:Any, N}}} + ) where {TO, N} + + out = arrays[1] + inputs_raw = Base.tail(arrays) + M = length(inputs_raw) + inputs = ntuple(i -> inputs_raw[i], Val(M)) + + # Number of output elements = product of non-reduction dims + out_total = prod( + ntuple(Val(N)) do d + @inbounds iszero(out.strides[d]) ? 1 : dims[d] + end + ) + + backend = KernelAbstractions.get_backend(parent(out)) + kernel! = _mapreduce_gpu_kernel!(backend) + kernel!(f, op, initop, dims, out, inputs; ndrange = out_total) + + return nothing +end + end diff --git a/src/broadcast.jl b/src/broadcast.jl index b480a70..b3151c9 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -20,8 +20,21 @@ function Broadcast.BroadcastStyle( end function Base.similar(bc::Broadcasted{<:StridedArrayStyle{N}}, ::Type{T}) where {N, T} - return StridedView(similar(convert(Broadcasted{DefaultArrayStyle{N}}, bc), T)) + sv = _find_strided_view(bc) + if sv !== nothing + return StridedView(similar(parent(sv), T, size(bc))) + end + return StridedView(similar(Array{T}, axes(bc))) +end + +@inline _find_strided_view(bc::Broadcasted) = _find_strided_view(bc.args...) +@inline _find_strided_view(sv::StridedView, rest...) = sv +@inline function _find_strided_view(nested::Broadcasted, rest...) + sv = _find_strided_view(nested) + return sv === nothing ? _find_strided_view(rest...) : sv end +@inline _find_strided_view(x, rest...) = _find_strided_view(rest...) +@inline _find_strided_view() = nothing Base.dotview(a::StridedView{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} = getindex(a, I...) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 8bfd7d9..c60aa5e 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -1,21 +1,11 @@ # Methods based on map! -function Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N} - return map!(identity, dst, src) -end +Base.copy!(dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}) where {N} = map!(identity, dst, src) Base.conj!(a::StridedView{<:Real}) = a Base.conj!(a::StridedView) = map!(conj, a, a) -function LinearAlgebra.adjoint!( - dst::StridedView{<:Any, N}, - src::StridedView{<:Any, N} - ) where {N} - return copy!(dst, adjoint(src)) -end -function Base.permutedims!( - dst::StridedView{<:Any, N}, src::StridedView{<:Any, N}, - p - ) where {N} - return copy!(dst, permutedims(src, p)) -end +LinearAlgebra.adjoint!(dst::StridedView, src::StridedView) = copy!(dst, adjoint(src)) +LinearAlgebra.transpose!(C::StridedView, A::StridedView) = copy!(C, transpose(A)) +Base.permutedims!(dst::StridedView, src::StridedView, p) = copy!(dst, permutedims(src, p)) +Base.fill!(A::StridedView, val) = map!(Returns(val), A) function Base.mapreduce(f, op, A::StridedView; dims = :, kw...) return Base._mapreduce_dim(f, op, values(kw), A, dims) diff --git a/test/jlarrays.jl b/test/jlarrays.jl deleted file mode 100644 index 5aceb35..0000000 --- a/test/jlarrays.jl +++ /dev/null @@ -1,19 +0,0 @@ -@testset for T in (Float32, Float64, Complex{Float32}, Complex{Float64}) - @testset "Copy with JLArrayStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint) - for m1 in (0, 16, 32), m2 in (0, 16, 32) - A1 = JLArray(randn(T, (m1, m2))) - A2 = similar(A1) - zA1 = JLArray(f1(zeros(T, (m1, m2)))) - zA2 = JLArray(f2(zeros(T, (m1, m2)))) - A1c = copy(A1) - A2c = copy(A2) - B1 = f1(StridedView(A1c)) - B2 = f2(StridedView(A2c)) - axes(f1(A1)) == axes(f2(A2)) || continue - @test collect(Matrix(copy!(f2(A2), f1(A1)))) == JLArrays.Adapt.adapt(Vector{T}, copy!(B2, B1)) - @test copy!(zA1, f1(A1)) == copy!(zA2, B1) - x = rand(T) - @test f1(StridedView(JLArrays.Adapt.adapt(Vector{T}, fill!(A1c, x)))) == JLArrays.Adapt.adapt(Vector{T}, fill!(B1, x)) - end - end -end diff --git a/test/mapreduce_tests.jl b/test/mapreduce_tests.jl new file mode 100644 index 0000000..46b3b34 --- /dev/null +++ b/test/mapreduce_tests.jl @@ -0,0 +1,145 @@ +# Parameterized mapreduce / map! tests. +# Iterates over both Array and JLArray backends internally. + +backends = [("Array", identity), ("JLArray", JLArray)] + +for (backend_name, make_arr) in backends + @testset "$backend_name: map! via StridedView" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + A = make_arr(rand(T, 8, 6)) + B = similar(A) + map!(x -> 2x, StridedView(B), StridedView(A)) + @test Array(StridedView(B)) ≈ 2 .* Array(StridedView(A)) + end + end + + @testset "$backend_name: mapreducedim! — sum over dim 1" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 1, 6)) + sum!(StridedView(B), StridedView(A)) + @test Array(StridedView(B)) ≈ sum(data; dims = 1) + end + end + + @testset "$backend_name: mapreducedim! — sum over dim 2" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 8, 1)) + sum!(StridedView(B), StridedView(A)) + @test Array(StridedView(B)) ≈ sum(data; dims = 2) + end + end + + @testset "$backend_name: map! with conj/adjoint StridedView" begin + for T in (ComplexF32, ComplexF64) + data = rand(T, 4, 4) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 4, 4)) + copy!(adjoint(StridedView(B)), StridedView(A)) + @test Array(StridedView(B)) ≈ adjoint(data) + end + end + + @testset "$backend_name: mapreduce — full scalar reduction" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + result = sum(StridedView(A)) + @test result isa T + @test result ≈ sum(data) + end + end + + # Only meaningful for GPU backends: mixing CPU and GPU inputs must not silently + # use the GPU dispatch path. + if make_arr !== identity + @testset "$backend_name: dispatch requires all inputs on GPU" begin + A_gpu = make_arr(rand(Float32, 4, 4)) + A_cpu = Array(StridedView(A_gpu)) + B_gpu = make_arr(zeros(Float32, 4, 4)) + @test_throws Exception map!(+, StridedView(B_gpu), StridedView(A_gpu), StridedView(A_cpu)) + end + end + + @testset "$backend_name: map! — stride-2 input (every other row)" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 4, 6)) + src = StridedView(A)[1:2:8, :] + map!(x -> 2x, StridedView(B), src) + @test Array(StridedView(B)) ≈ 2 .* data[1:2:8, :] + end + end + + @testset "$backend_name: map! — stride-2 output (every other row)" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 4, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 8, 6)) + dst = StridedView(B)[1:2:8, :] + map!(identity, dst, StridedView(A)) + B_cpu = Array(StridedView(B)) + @test B_cpu[1:2:8, :] ≈ data + @test all(iszero, B_cpu[2:2:8, :]) + end + end + + @testset "$backend_name: map! — subview with nonzero offset" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 5, 4)) + src = StridedView(A)[2:6, 3:6] + map!(x -> x + 1, StridedView(B), src) + @test Array(StridedView(B)) ≈ data[2:6, 3:6] .+ 1 + end + end + + @testset "$backend_name: map! — permuted (transposed) strides" begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + data = rand(T, 6, 8) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 8, 6)) + src = permutedims(StridedView(A), (2, 1)) + map!(identity, StridedView(B), src) + @test Array(StridedView(B)) ≈ permutedims(data, (2, 1)) + end + end + + @testset "$backend_name: sum over dim 1 — stride-2 input" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 1, 6)) + src = StridedView(A)[1:2:8, :] + sum!(StridedView(B), src) + @test Array(StridedView(B)) ≈ sum(data[1:2:8, :]; dims = 1) + end + end + + @testset "$backend_name: sum over dim 2 — subview with offset" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + B = make_arr(zeros(T, 5, 1)) + src = StridedView(A)[2:6, 2:5] + sum!(StridedView(B), src) + @test Array(StridedView(B)) ≈ sum(data[2:6, 2:5]; dims = 2) + end + end + + @testset "$backend_name: full scalar reduction — stride-2 and offset subview" begin + for T in (Float32, Float64) + data = rand(T, 8, 6) + A = make_arr(copy(data)) + r1 = sum(StridedView(A)[1:2:8, :]) + @test r1 ≈ sum(data[1:2:8, :]) + r2 = sum(StridedView(A)[3:6, 2:5]) + @test r2 ≈ sum(data[3:6, 2:5]) + end + end +end diff --git a/test/othertests.jl b/test/othertests.jl index 8c775ce..ea5fee1 100644 --- a/test/othertests.jl +++ b/test/othertests.jl @@ -1,157 +1,213 @@ +backends = [("Array", identity), ("JLArray", JLArray)] + @testset "in-place matrix operations" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - A1 = randn(T, (1000, 1000)) - A2 = similar(A1) - A1c = copy(A1) - A2c = copy(A2) - B1 = StridedView(A1c) - B2 = StridedView(A2c) - - @test conj!(A1) == conj!(B1) - @test adjoint!(A2, A1) == adjoint!(B2, B1) - @test transpose!(A2, A1) == transpose!(B2, B1) - @test permutedims!(A2, A1, (2, 1)) == permutedims!(B2, B1, (2, 1)) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + data1 = randn(T, (1000, 1000)) + data2 = randn(T, (1000, 1000)) + # CPU reference + A1 = copy(data1); A2 = copy(data2) + # Backend under test + B1 = StridedView(make_arr(copy(data1))) + B2 = StridedView(make_arr(copy(data2))) + + conj!(A1); conj!(B1) + @test A1 ≈ Array(B1) + adjoint!(A2, A1); adjoint!(B2, B1) + @test A2 ≈ Array(B2) + transpose!(A2, A1); transpose!(B2, B1) + @test A2 ≈ Array(B2) + permutedims!(A2, A1, (2, 1)); permutedims!(B2, B1, (2, 1)) + @test A2 ≈ Array(B2) + end end end @testset "map, scale!, axpy! and axpby! with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - @testset for N in 2:6 - dims = ntuple(n -> div(60, N), N) - R1, R2, R3 = rand(T, dims), rand(T, dims), rand(T, dims) - B1 = permutedims(StridedView(R1), randperm(N)) - B2 = permutedims(StridedView(R2), randperm(N)) - B3 = permutedims(StridedView(R3), randperm(N)) - A1 = convert(Array, B1) - A2 = convert(Array{T}, B2) # test different converts - A3 = convert(Array{T, N}, B3) - C1 = deepcopy(B1) - - @test rmul!(B1, 1 // 2) ≈ rmul!(A1, 1 // 2) - @test lmul!(1 // 3, B2) ≈ lmul!(1 // 3, A2) - @test axpy!(1 // 3, B1, B2) ≈ axpy!(1 // 3, A1, A2) - @test axpy!(1, B2, B3) ≈ axpy!(1, A2, A3) - @test axpby!(1 // 3, B1, 1 // 2, B3) ≈ axpby!(1 // 3, A1, 1 // 2, A3) - @test axpby!(1, B2, 1, B1) ≈ axpby!(1, A2, 1, A1) - @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) ≈ - map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3) - @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView - @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array - @test mul!(B1, 1, B2) ≈ mul!(A1, 1, A2) - @test mul!(B1, B2, 1) ≈ mul!(A1, A2, 1) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + @testset for N in 2:6 + dims = ntuple(n -> div(60, N), N) + perm1, perm2, perm3 = randperm(N), randperm(N), randperm(N) + R1_cpu, R2_cpu, R3_cpu = rand(T, dims), rand(T, dims), rand(T, dims) + R1 = make_arr(copy(R1_cpu)) + R2 = make_arr(copy(R2_cpu)) + R3 = make_arr(copy(R3_cpu)) + B1 = permutedims(StridedView(R1), perm1) + B2 = permutedims(StridedView(R2), perm2) + B3 = permutedims(StridedView(R3), perm3) + A1 = Array(B1) + A2 = Array(B2) + A3 = Array(B3) + + @test Array(rmul!(B1, 1 // 2)) ≈ rmul!(A1, 1 // 2) + @test Array(lmul!(1 // 3, B2)) ≈ lmul!(1 // 3, A2) + @test Array(axpy!(1 // 3, B1, B2)) ≈ axpy!(1 // 3, A1, A2) + @test Array(axpy!(1, B2, B3)) ≈ axpy!(1, A2, A3) + @test Array(axpby!(1 // 3, B1, 1 // 2, B3)) ≈ axpby!(1 // 3, A1, 1 // 2, A3) + @test Array(axpby!(1, B2, 1, B1)) ≈ axpby!(1, A2, 1, A1) + @test Array(map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3)) ≈ + map((x, y, z) -> sin(x) + y / exp(-abs(z)), A1, A2, A3) + @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, B2, B3) isa StridedView + if make_arr === identity + @test map((x, y, z) -> sin(x) + y / exp(-abs(z)), B1, A2, B3) isa Array + end + @test Array(mul!(B1, 1, B2)) ≈ mul!(A1, 1, A2) + @test Array(mul!(B1, B2, 1)) ≈ mul!(A1, A2, 1) + end end end end @testset "broadcast with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - R1, R2, R3 = rand(T, (10,)), rand(T, (10, 10)), rand(T, (10, 10, 10)) - B1 = StridedView(R1) - B2 = permutedims(StridedView(R2), randperm(2)) - B3 = permutedims(StridedView(R3), randperm(3)) - A1 = convert(Array, B1) - A2 = convert(Array{T}, B2) - A3 = convert(Array{T, 3}, B3) - - @test @inferred(B1 .+ sin.(B2 .- 3)) ≈ A1 .+ sin.(A2 .- 3) - @test @inferred(B2' .* B3 .- Ref(0.5)) ≈ A2' .* A3 .- Ref(0.5) - @test @inferred(B2' .* B3 .- max.(abs.(B1), real.(B3))) ≈ - A2' .* A3 .- max.(abs.(A1), real.(A3)) - - @test (B1 .+ sin.(B2 .- 3)) isa StridedView - @test (B2' .* B3 .- Ref(0.5)) isa StridedView - @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView - @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + R1_cpu = rand(T, (10,)) + R2_cpu = rand(T, (10, 10)) + R3_cpu = rand(T, (10, 10, 10)) + perm2, perm3 = randperm(2), randperm(3) + R1 = make_arr(copy(R1_cpu)) + R2 = make_arr(copy(R2_cpu)) + R3 = make_arr(copy(R3_cpu)) + B1 = StridedView(R1) + B2 = permutedims(StridedView(R2), perm2) + B3 = permutedims(StridedView(R3), perm3) + A1 = Array(B1) + A2 = Array(B2) + A3 = Array(B3) + + @test Array(@inferred(B1 .+ sin.(B2 .- 3))) ≈ A1 .+ sin.(A2 .- 3) + @test Array(@inferred(B2' .* B3 .- Ref(0.5))) ≈ A2' .* A3 .- Ref(0.5) + @test Array(@inferred(B2' .* B3 .- max.(abs.(B1), real.(B3)))) ≈ + A2' .* A3 .- max.(abs.(A1), real.(A3)) + + @test (B1 .+ sin.(B2 .- 3)) isa StridedView + @test (B2' .* B3 .- Ref(0.5)) isa StridedView + @test (B2' .* B3 .- max.(abs.(B1), real.(B3))) isa StridedView + if make_arr === identity + @test (B2' .* A3 .- max.(abs.(B1), real.(B3))) isa Array + end + end end end @testset "broadcast with zero-length StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - A1 = StridedView(zeros(T, (2, 0))) - A2 = StridedView(zeros(T, (2, 0))) - @test (A1 .+ A2) == StridedView(zeros(T, (2, 0))) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + A1 = StridedView(make_arr(zeros(T, (2, 0)))) + A2 = StridedView(make_arr(zeros(T, (2, 0)))) + @test Array(A1 .+ A2) == zeros(T, (2, 0)) + end end end @testset "mapreduce with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - R1 = rand(T, (10, 10, 10, 10, 10, 10)) - @test sum(R1; dims = (1, 3, 5)) ≈ sum(StridedView(R1); dims = (1, 3, 5)) - @test mapreduce(sin, +, R1; dims = (1, 3, 5)) ≈ - mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5)) - R2 = rand(T, (10, 10, 10)) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, identity, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ reshape(R2, (10, 1, 1, 10, 10, 1)) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, x -> 0, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) - R2c = copy(R2) - β = rand(T) - @test Strided._mapreducedim!( - sin, +, x -> β * x, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ - β .* reshape(R2, (10, 1, 1, 10, 10, 1)) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, x -> β, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6), init = β) - R2c = copy(R2) - @test Strided._mapreducedim!( - sin, +, conj, (10, 10, 10, 10, 10, 10), - ( - sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), - StridedView(R1), - ) - ) ≈ - mapreduce(sin, +, R1; dims = (2, 3, 6)) .+ - conj.(reshape(R2, (10, 1, 1, 10, 10, 1))) - - R3 = rand(T, (100, 100, 2)) - @test sum(R3; dims = (1, 2)) ≈ sum(StridedView(R3); dims = (1, 2)) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + R1_cpu = rand(T, (10, 10, 10, 10, 10, 10)) + R2_cpu = rand(T, (10, 10, 10)) + R1 = make_arr(copy(R1_cpu)) + R2 = make_arr(copy(R2_cpu)) + + @test sum(StridedView(R1); dims = (1, 3, 5)) isa StridedView + @test Array(sum(StridedView(R1); dims = (1, 3, 5))) ≈ sum(R1_cpu; dims = (1, 3, 5)) + @test Array(mapreduce(sin, +, StridedView(R1); dims = (1, 3, 5))) ≈ + mapreduce(sin, +, R1_cpu; dims = (1, 3, 5)) + + R2c = copy(R2) + @test Array( + Strided._mapreducedim!( + sin, +, identity, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + ) + ) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ reshape(R2_cpu, (10, 1, 1, 10, 10, 1)) + + R2c = copy(R2) + @test Array( + Strided._mapreducedim!( + sin, +, x -> 0, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + ) + ) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) + + R2c = copy(R2) + β = rand(T) + @test Array( + Strided._mapreducedim!( + sin, +, x -> β * x, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + ) + ) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ + β .* reshape(R2_cpu, (10, 1, 1, 10, 10, 1)) + + R2c = copy(R2) + @test Array( + Strided._mapreducedim!( + sin, +, x -> β, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + ) + ) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6), init = β) + + R2c = copy(R2) + @test Array( + Strided._mapreducedim!( + sin, +, conj, (10, 10, 10, 10, 10, 10), + ( + sreshape(StridedView(R2c), (10, 1, 1, 10, 10, 1)), + StridedView(R1), + ) + ) + ) ≈ + mapreduce(sin, +, R1_cpu; dims = (2, 3, 6)) .+ + conj.(reshape(R2_cpu, (10, 1, 1, 10, 10, 1))) + + R3_cpu = rand(T, (100, 100, 2)) + R3 = make_arr(copy(R3_cpu)) + @test Array(sum(StridedView(R3); dims = (1, 2))) ≈ sum(R3_cpu; dims = (1, 2)) + end end end @testset "complete reductions with StridedView" begin - @testset for T in (Float32, Float64, ComplexF32, ComplexF64) - R1 = rand(T, (10, 10, 10, 10, 10, 10)) - - @test sum(R1) ≈ sum(StridedView(R1)) - @test maximum(abs, R1) ≈ maximum(abs, StridedView(R1)) - @test minimum(real, R1) ≈ minimum(real, StridedView(R1)) - @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R1)) - - R2 = PermutedDimsArray(R1, (randperm(6)...,)) - - @test sum(R2) ≈ sum(StridedView(R2)) - @test maximum(abs, R2) ≈ maximum(abs, StridedView(R2)) - @test minimum(real, R2) ≈ minimum(real, StridedView(R2)) - @test sum(x -> real(x) < 0, R1) == sum(x -> real(x) < 0, StridedView(R2)) - - R3 = rand(T, (5, 5, 5)) - @test prod(exp, StridedView(R3)) ≈ exp(sum(StridedView(R3))) + for (backend_name, make_arr) in backends + @testset "$T ($backend_name)" for T in (Float32, Float64, ComplexF32, ComplexF64) + R1_cpu = rand(T, (10, 10, 10, 10, 10, 10)) + R1 = make_arr(copy(R1_cpu)) + + @test sum(StridedView(R1)) ≈ sum(R1_cpu) + @test maximum(abs, StridedView(R1)) ≈ maximum(abs, R1_cpu) + @test minimum(real, StridedView(R1)) ≈ minimum(real, R1_cpu) + @test sum(x -> real(x) < 0, StridedView(R1)) == sum(x -> real(x) < 0, R1_cpu) + + perm = (randperm(6)...,) + R2_cpu = PermutedDimsArray(R1_cpu, perm) + R2 = PermutedDimsArray(R1, perm) + + @test sum(StridedView(R2)) ≈ sum(R2_cpu) + @test maximum(abs, StridedView(R2)) ≈ maximum(abs, R2_cpu) + @test minimum(real, StridedView(R2)) ≈ minimum(real, R2_cpu) + @test sum(x -> real(x) < 0, StridedView(R2)) == sum(x -> real(x) < 0, R1_cpu) + + R3_cpu = rand(T, (5, 5, 5)) + R3 = make_arr(copy(R3_cpu)) + @test prod(exp, StridedView(R3)) ≈ exp(sum(StridedView(R3))) + end end end diff --git a/test/runtests.jl b/test/runtests.jl index 4108876..80dd88b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,7 +11,9 @@ Random.seed!(1234) is_buildkite = get(ENV, "BUILDKITE", "false") == "true" if !is_buildkite - include("jlarrays.jl") + @testset "mapreduce tests" begin + include("mapreduce_tests.jl") + end println("Base.Threads.nthreads() = $(Base.Threads.nthreads())") println("Running tests single-threaded:")