From 1fdbca448fb74d978d1dc1c09647c96d5577e183 Mon Sep 17 00:00:00 2001 From: Chris Elrod Date: Mon, 24 Feb 2020 15:14:46 -0500 Subject: [PATCH] Also checking maximum small crush values. --- Manifest.toml | 8 ++-- Project.toml | 6 +-- src/PCG.jl | 118 ++++++++++++++++++++++++++++++++++++----------- test/runtests.jl | 26 +++++++---- 4 files changed, 115 insertions(+), 43 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 54980f6..070e713 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -37,9 +37,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[SIMDPirates]] deps = ["VectorizationBase"] -git-tree-sha1 = "ecacd3f808e559d9e363f2620041c6286c8efaca" +git-tree-sha1 = "4b1e0b1442fb4af5e6b93b9c7fdeacf287d2653b" uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a" -version = "0.4.0" +version = "0.5.0" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -53,6 +53,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[VectorizationBase]] deps = ["CpuId", "LinearAlgebra"] -git-tree-sha1 = "b9b5c8fa55e9b859989e759f405624d16b0b0ca2" +git-tree-sha1 = "9f8caaa5d033f88e188f62a3dba0dab5f429447a" uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" -version = "0.4.2" +version = "0.5.0" diff --git a/Project.toml b/Project.toml index de1022f..f9f7f24 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "VectorizedRNG" uuid = "33b4df10-0173-11e9-2a0c-851a7edac40e" authors = ["Chris Elrod "] -version = "0.1.1" +version = "0.1.2" [deps] Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" @@ -10,8 +10,8 @@ SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [compat] -SIMDPirates = "0.4" -VectorizationBase = "0.4.2" +SIMDPirates = "~0.5" +VectorizationBase = "~0.5" julia = "1" [extras] diff --git a/src/PCG.jl b/src/PCG.jl index 6c9cab3..484cbd1 100644 --- a/src/PCG.jl +++ b/src/PCG.jl @@ -53,13 +53,13 @@ end for n ∈ 0:N-1 n_quote = quote # state - SIMDPirates.storea!(gep(ptr, $n * $W64), (Base.Cartesian.@ntuple $W64 w -> Core.VecElement(rand(UInt64)))) + SIMDPirates.vstorea!(gep(ptr, $n * $W64), (Base.Cartesian.@ntuple $W64 w -> Core.VecElement(rand(UInt64)))) # multiplier - SIMDPirates.storea!(gep(ptr, $(N + n) * $W64), MULTIPLIERS[(Base.Threads.atomic_add!(MULT_NUMBER, 1) + offset * $N - 1) % $(length(MULTIPLIERS)) + 1]) + SIMDPirates.vstorea!(gep(ptr, $(N + n) * $W64), MULTIPLIERS[(Base.Threads.atomic_add!(MULT_NUMBER, 1) + offset * $N - 1) % $(length(MULTIPLIERS)) + 1]) end push!(q.args, n_quote) end - push!(q.args, :(VectorizationBase.store!(gep(ptr, $(2N)*$W64), one(UInt64) + 2 * ((MULT_NUMBER[] + offset * N - 1) ÷ $(length(MULTIPLIERS)))))) + push!(q.args, :(VectorizationBase.vstore!(gep(ptr, $(2N)*$W64), one(UInt64) + 2 * ((MULT_NUMBER[] + offset * N - 1) ÷ $(length(MULTIPLIERS)))))) push!(q.args, :pcg) q end @@ -115,24 +115,88 @@ out_tup_expr(N) = name_n_tup_expr(:out_, N) multiplier = Symbol(:multiplier_, n) push!(states.args, state); push!(multipliers.args, multiplier) push!(q.args, quote - $state = loada(Vec{$WV,UInt64}, prng + $(REGISTER_SIZE * (n-1))) - $multiplier = loada(Vec{$WV,UInt64}, prng + $(REGISTER_SIZE * (P + n-1))) + $state = vloada(Vec{$WV,UInt64}, prng + $(REGISTER_SIZE * (n-1))) + $multiplier = vloada(Vec{$WV,UInt64}, prng + $(REGISTER_SIZE * (P + n-1))) end) end push!(q.args, Expr(:tuple, states, multipliers, :increment)) q end +@inline function load_vectors(rng::AbstractPCG{4}, ::Val{1}, ::Val{WV}) where {WV} + prng = pointer(rng) + states = ( + vloada(Vec{WV,UInt64}, prng), + ) + multipliers = ( + vloada(Vec{WV,UInt64}, prng, 4W64), + ) + increment = vbroadcast(Vec{WV,UInt64}, gep(prng, 8W64)) + states, multipliers, increment +end +@inline function load_vectors(rng::AbstractPCG{4}, ::Val{2}, ::Val{WV}) where {WV} + prng = pointer(rng) + states = ( + vloada(Vec{WV,UInt64}, prng), + vloada(Vec{WV,UInt64}, prng, W64) + ) + multipliers = ( + vloada(Vec{WV,UInt64}, prng, 4W64), + vloada(Vec{WV,UInt64}, prng, 5W64) + ) + increment = vbroadcast(Vec{WV,UInt64}, gep(prng, 8W64)) + states, multipliers, increment +end +@inline function load_vectors(rng::AbstractPCG{4}, ::Val{3}, ::Val{WV}) where {WV} + prng = pointer(rng) + states = ( + vloada(Vec{WV,UInt64}, prng), + vloada(Vec{WV,UInt64}, prng, W64), + vloada(Vec{WV,UInt64}, prng, 2W64) + ) + multipliers = ( + vloada(Vec{WV,UInt64}, prng, 4W64), + vloada(Vec{WV,UInt64}, prng, 5W64), + vloada(Vec{WV,UInt64}, prng, 6W64) + ) + increment = vbroadcast(Vec{WV,UInt64}, gep(prng, 8W64)) + states, multipliers, increment +end +@inline function load_vectors(rng::AbstractPCG{4}, ::Val{4}, ::Val{WV}) where {WV} + prng = pointer(rng) + states = ( + vloada(Vec{WV,UInt64}, prng), + vloada(Vec{WV,UInt64}, prng, W64), + vloada(Vec{WV,UInt64}, prng, 2W64), + vloada(Vec{WV,UInt64}, prng, 3W64) + ) + multipliers = ( + vloada(Vec{WV,UInt64}, prng, 4W64), + vloada(Vec{WV,UInt64}, prng, 5W64), + vloada(Vec{WV,UInt64}, prng, 6W64), + vloada(Vec{WV,UInt64}, prng, 7W64) + ) + increment = vbroadcast(Vec{WV,UInt64}, gep(prng, 8W64)) + states, multipliers, increment +end @generated function store_state!(rng::AbstractPCG, states::NTuple{N,Vec{W,T}}) where {N,W,T} q = quote # $(Expr(:meta,:inline)) prng = pointer(rng) end for n ∈ 1:N - push!(q.args, :(storea!(prng + $(REGISTER_SIZE * (n-1)), @inbounds($(Expr(:ref, :states, n)))))) + push!(q.args, :(vstorea!(prng + $(REGISTER_SIZE * (n-1)), @inbounds($(Expr(:ref, :states, n)))))) end push!(q.args, nothing) q end +@inline function store_state!(rng::AbstractPCG, states::NTuple{4,Vec{W,T}}) where {W,T} + prng = pointer(rng) + vstorea!(prng, (@inbounds states[1])) + vstorea!(prng, (@inbounds states[2]), W64) + vstorea!(prng, (@inbounds states[3]), 2W64) + vstorea!(prng, (@inbounds states[4]), 3W64) + nothing +end @noinline function rand_pcgPCG_RXS_M_XS_int64_quote(N, WV, Nreps) q = quote end @@ -474,17 +538,17 @@ function random_sample!(f, rng::AbstractPCG{2}, x::AbstractArray{Float64}) n = 0 while n < N + 1 - 2W64 state, (z₁,z₂) = f(state, mult, incr, Val{2}(), Float64) - store!(ptrx, z₁, n); n += W64 - store!(ptrx, z₂, n); n += W64 + vstore!(ptrx, z₁, n); n += W64 + vstore!(ptrx, z₂, n); n += W64 end mask = VectorizationBase.masktable(Val{W64}(), N & (W64-1)) if n < N - 1W64 state, (z₁,z₂) = f(state, mult, incr, Val{2}(), Float64) - store!(ptrx, z₁, n); n += W64 - store!(ptrx, z₂, n, mask); + vstore!(ptrx, z₁, n); n += W64 + vstore!(ptrx, z₂, n, mask); elseif n < N - state, (z₁,) = f(state, mult, incr, Val{1}(), Float64) - store!(ptrx, z₁, n, mask); + vstate, (z₁,) = f(state, mult, incr, Val{1}(), Float64) + vstore!(ptrx, z₁, n, mask); end store_state!(rng, state) end # GC preserve @@ -498,30 +562,30 @@ function random_sample!(f, rng::AbstractPCG{4}, x::AbstractArray{Float64}) n = 0 while n < N + 1 - 4W64 state, (z₁,z₂,z₃,z₄) = f(state, mult, incr, Val{4}(), Float64) - store!(ptrx, z₁, n); n += W64 - store!(ptrx, z₂, n); n += W64 - store!(ptrx, z₃, n); n += W64 - store!(ptrx, z₄, n); n += W64 + vstore!(ptrx, z₁, n); n += W64 + vstore!(ptrx, z₂, n); n += W64 + vstore!(ptrx, z₃, n); n += W64 + vstore!(ptrx, z₄, n); n += W64 end mask = VectorizationBase.masktable(Val{W64}(), N & (W64-1)) if n < N - 3W64 state, (z₁,z₂,z₃,z₄) = f(state, mult, incr, Val{4}(), Float64) - store!(ptrx, z₁, n); n += W64 - store!(ptrx, z₂, n); n += W64 - store!(ptrx, z₃, n); n += W64 - store!(ptrx, z₄, n, mask); + vstore!(ptrx, z₁, n); n += W64 + vstore!(ptrx, z₂, n); n += W64 + vstore!(ptrx, z₃, n); n += W64 + vstore!(ptrx, z₄, n, mask); elseif n < N - 2W64 state, (z₁,z₂,z₃) = f(state, mult, incr, Val{3}(), Float64) - store!(ptrx, z₁, n); n += W64 - store!(ptrx, z₂, n); n += W64 - store!(ptrx, z₃, n, mask); + vstore!(ptrx, z₁, n); n += W64 + vstore!(ptrx, z₂, n); n += W64 + vstore!(ptrx, z₃, n, mask); elseif n < N - W64 state, (z₁,z₂) = f(state, mult, incr, Val{2}(), Float64) - store!(ptrx, z₁, n); n += W64 - store!(ptrx, z₂, n, mask); + vstore!(ptrx, z₁, n); n += W64 + vstore!(ptrx, z₂, n, mask); elseif n < N - state, (z₁,) = f(state, mult, incr, Val{1}(), Float64) - store!(ptrx, z₁, n, mask); + vstate, (z₁,) = f(state, mult, incr, Val{1}(), Float64) + vstore!(ptrx, z₁, n, mask); end store_state!(rng, state) end # GC preserve diff --git a/test/runtests.jl b/test/runtests.jl index a3a65de..3c326be 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,14 +3,18 @@ using Test using RNGTest, Distributions, Random -function smallcrushminp(res) +function smallcrushextrema(res) r1 = Base.Cartesian.@ntuple 5 i -> (res[i])::Float64 r2 = res[6]::Tuple{Float64,Float64} r3 = Base.Cartesian.@ntuple 3 i -> (res[i+6])::Float64 r4 = res[10]::NTuple{5,Float64} - min( + mi = min( minimum(r1), minimum(r2), minimum(r3), minimum(r4) ) + ma = max( + maximum(r1), maximum(r2), maximum(r3), maximum(r4) + ) + mi, ma end const INVSQRT2 = 1/sqrt(2) @@ -26,15 +30,15 @@ function normalcdf!(x::AbstractVector{T}) where {T} i = 0 for _ ∈ 1:(N >>> Wshift) ptrxᵢ = VectorizedRNG.VectorizationBase.gep(ptrx, i) - v = VectorizedRNG.SIMDPirates.load(W, ptrxᵢ) - VectorizedRNG.SIMDPirates.store!(ptrxᵢ, normalcdf(v)) + v = VectorizedRNG.SIMDPirates.vload(W, ptrxᵢ) + VectorizedRNG.SIMDPirates.vstore!(ptrxᵢ, normalcdf(v)) i += _W end if i < N ptrxᵢ = VectorizedRNG.VectorizationBase.gep(ptrx, i) mask = VectorizedRNG.VectorizationBase.mask(T, N & (_W - 1)) - v = VectorizedRNG.SIMDPirates.load(W, ptrxᵢ, mask) - VectorizedRNG.SIMDPirates.store!(ptrxᵢ, normalcdf(v), mask) + v = VectorizedRNG.SIMDPirates.vload(W, ptrxᵢ, mask) + VectorizedRNG.SIMDPirates.vstore!(ptrxᵢ, normalcdf(v), mask) end x end @@ -51,15 +55,19 @@ end @testset "VectorizedRNG.jl" begin # Write your own tests here. - + α = 1e-4 rngunif = RNGTest.wrap(local_pcg(), Float64); res = RNGTest.smallcrushJulia(rngunif) - @test smallcrushminp(res) > eps() + mi, ma = smallcrushextrema(res) + @test mi > α + @test ma < 1 - α rngnorm = RNGTest.wrap(RandNormal01(local_pcg()), Float64); res = RNGTest.smallcrushJulia(rngnorm) - @test smallcrushminp(res) > eps() + mi, ma = smallcrushextrema(res) + @test mi > α + @test ma < 1 - α end