From 36361f28d75f5a7a0a4083b073ebc2ccb862b4ec Mon Sep 17 00:00:00 2001
From: Thore Kockerols
Date: Sat, 14 Jun 2025 19:31:18 +0200
Subject: [PATCH 1/2] Add threaded buffer variants

---
 benchmark/sparse_buffer_mult.jl         | 292 ++++++++++++++++++++++++
 benchmark/sparse_buffer_mult_output.txt |  37 +++
 2 files changed, 329 insertions(+)
 create mode 100644 benchmark/sparse_buffer_mult.jl
 create mode 100644 benchmark/sparse_buffer_mult_output.txt

diff --git a/benchmark/sparse_buffer_mult.jl b/benchmark/sparse_buffer_mult.jl
new file mode 100644
index 000000000..c2cf97386
--- /dev/null
+++ b/benchmark/sparse_buffer_mult.jl
@@ -0,0 +1,292 @@
+using SparseArrays
+using BenchmarkTools
+using Base.Threads
+using OhMyThreads: tforeach
+using LoopVectorization
+
+"""
+    spmm_buffer!(C, A, B)
+
+Multiply sparse matrices `A` and `B`, storing the result in the
+preallocated sparse matrix `C`.
+
+The sparsity pattern of `C` should match the expected result.
+This avoids allocations when repeatedly multiplying same-sized
+matrices.
+"""
+function spmm_buffer!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    fill!(C.nzval, zero(eltype(C)))
+    SparseArrays._spmatmul!(C, A, B, one(eltype(C)), zero(eltype(C)))
+    return C
+end
+
+
+"""
+    _col_map!(colmap, C, j)
+
+Fill `colmap` with a lookup for column `j` of `C`. All entries are reset to
+zero so the same vector can be reused across columns without additional
+allocations.
+"""
+function _col_map!(colmap::Vector{Int}, C::SparseMatrixCSC, j::Integer)
+    fill!(colmap, 0)
+    @inbounds for p in C.colptr[j]:(C.colptr[j+1]-1)
+        colmap[C.rowval[p]] = p
+    end
+    return colmap
+end
+
+
+"""
+    spmm_buffer_fast!(C, A, B)
+
+Same as [`spmm_buffer!`](@ref) but uses a column lookup table built internally.
+This version is convenient when the map is not available beforehand.
+"""
+function spmm_buffer_fast!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    colmap = zeros(Int, n)
+    fill!(C.nzval, zero(eltype(C)))
+    @inbounds for j in 1:size(B,2)
+        _col_map!(colmap, C, j)
+        for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += a * b
+                end
+            end
+        end
+    end
+    return C
+end
+
+"""
+    spmm_buffer_col!(C, A, B)
+
+Column-oriented buffered multiplication that builds the lookup table on each
+call. Useful for quick tests but allocates the map.
+"""
+function spmm_buffer_col!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    colmap = zeros(Int, n)
+    fill!(C.nzval, zero(eltype(C)))
+    @inbounds for j in 1:size(B,2)
+        _col_map!(colmap, C, j)
+        for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += a * b
+                end
+            end
+        end
+    end
+    return C
+end
+
+"""
+    spmm_buffer_threaded!(C, A, B[, α, β])
+
+Threaded buffered multiplication with internal map creation; computes `C = α*A*B + β*C`.
+"""
+function spmm_buffer_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC, α, β)
+    n = size(C, 1)
+    maps = [zeros(Int, n) for _ in 1:nthreads()]
+    if β == zero(β)
+        fill!(C.nzval, zero(eltype(C)))
+    else
+        @. C.nzval *= β
+    end
+    @threads :static for j in 1:size(B,2)
+        colmap = maps[threadid()]
+        _col_map!(colmap, C, j)
+        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += α * a * b
+                end
+            end
+        end
+    end
+    return C
+end
+
+spmm_buffer_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) =
+    spmm_buffer_threaded!(C, A, B, one(eltype(C)), zero(eltype(C)))
+
+function spmm_buffer_simd!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    colmap = zeros(Int, n)
+    fill!(C.nzval, zero(eltype(C)))
+    @inbounds for j in 1:size(B,2)
+        _col_map!(colmap, C, j)
+        for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] = muladd(a, b, C.nzval[idx])
+                end
+            end
+        end
+    end
+    return C
+end
+
+function spmm_buffer_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    maps = [zeros(Int, n) for _ in 1:Threads.nthreads()]
+    fill!(C.nzval, zero(eltype(C)))
+    tforeach(1:size(B,2)) do j
+        colmap = maps[Threads.threadid()]
+        _col_map!(colmap, C, j)
+        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += a * b
+                end
+            end
+        end
+    end
+    return C
+end
+
+function spmm_buffer_simd_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    maps = [zeros(Int, n) for _ in 1:nthreads()]
+    fill!(C.nzval, zero(eltype(C)))
+    @threads :static for j in 1:size(B,2)
+        colmap = maps[threadid()]
+        _col_map!(colmap, C, j)
+        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] = muladd(a, b, C.nzval[idx])
+                end
+            end
+        end
+    end
+    return C
+end
+
+function spmm_buffer_simd_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    maps = [zeros(Int, n) for _ in 1:Threads.nthreads()]
+    fill!(C.nzval, zero(eltype(C)))
+    tforeach(1:size(B,2)) do j
+        colmap = maps[Threads.threadid()]
+        _col_map!(colmap, C, j)
+        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] = muladd(a, b, C.nzval[idx])
+                end
+            end
+        end
+    end
+    return C
+end
+
+
+spmm_no_buffer(A::SparseMatrixCSC, B::SparseMatrixCSC) = A * B
+
+function random_sparse(n, m, nnz; T=Float64)
+    I = rand(1:n, nnz)
+    J = rand(1:m, nnz)
+    V = rand(T, nnz)
+    sparse(I, J, V, n, m)
+end
+
+function expand_pattern(pattern::SparseMatrixCSC, extra::Integer)
+    C = copy(pattern)
+    n, m = size(pattern)
+    added = 0
+    while added < extra
+        i = rand(1:n)
+        j = rand(1:m)
+        if C[i, j] == 0
+            C[i, j] = one(eltype(C))
+            added += 1
+        end
+    end
+    fill!(C.nzval, zero(eltype(C)))
+    return C
+end
+
+function bench_case(n, p, m, nnzA, nnzB; extra_ratio=0.0)
+    A = random_sparse(n, p, nnzA)
+    B = random_sparse(p, m, nnzB)
+
+    pattern = A * B
+    if extra_ratio > 0
+        extra = round(Int, nnz(pattern) * extra_ratio)
+        pattern = expand_pattern(pattern, extra)
+    end
+
+    C1 = copy(pattern); fill!(C1.nzval, zero(eltype(C1)))
+    C2 = copy(pattern); fill!(C2.nzval, zero(eltype(C2)))
+    C3 = copy(pattern); fill!(C3.nzval, zero(eltype(C3)))
+    C4 = copy(pattern); fill!(C4.nzval, zero(eltype(C4)))
+    C5 = copy(pattern); fill!(C5.nzval, zero(eltype(C5)))
+    C6 = copy(pattern); fill!(C6.nzval, zero(eltype(C6)))
+    C7 = copy(pattern); fill!(C7.nzval, zero(eltype(C7)))
+
+    println("non zeros in A: $(nnz(A)) B: $(nnz(B)) pattern: $(nnz(pattern))")
+    println("buffer basic")
+    @btime spmm_buffer_fast!($C1, $A, $B)
+    println("buffer threaded")
+    @btime spmm_buffer_threaded!($C2, $A, $B)
+    println("buffer simd")
+    @btime spmm_buffer_simd!($C3, $A, $B)
+    println("buffer omt")
+    @btime spmm_buffer_omt!($C4, $A, $B)
+    println("buffer simd threaded")
+    @btime spmm_buffer_simd_threaded!($C5, $A, $B)
+    println("buffer simd omt")
+    @btime spmm_buffer_simd_omt!($C6, $A, $B)
+    println("buffer threaded five args")
+    @btime spmm_buffer_threaded!($C7, $A, $B, 1.0, 0.0)
+    println("no buffer")
+    @btime spmm_no_buffer($A, $B)
+
+    C_ref = spmm_no_buffer(A, B)
+    @assert isapprox(C1, C_ref; atol=1e-12, rtol=1e-8)
+    @assert isapprox(C2, C_ref; atol=1e-12, rtol=1e-8)
+    @assert isapprox(C3, C_ref; atol=1e-12, rtol=1e-8)
+    @assert isapprox(C4, C_ref; atol=1e-12, rtol=1e-8)
+    @assert isapprox(C5, C_ref; atol=1e-12, rtol=1e-8)
+    @assert isapprox(C6, C_ref; atol=1e-12, rtol=1e-8)
+    @assert isapprox(C7, C_ref; atol=1e-12, rtol=1e-8)
+end
+
+function run()
+    n = 1000
+    p = 1000
+    m = 1000
+    nnz_values = round.(Int, 10 .^ range(3, log10(2e5); length=7))
+    for nnzA in nnz_values
+        nnzB = nnzA
+        println("\n---- nnz = $nnzA ----")
+        bench_case(n, p, m, nnzA, nnzB)
+    end
+end
+
+run()
diff --git a/benchmark/sparse_buffer_mult_output.txt b/benchmark/sparse_buffer_mult_output.txt
new file mode 100644
index 000000000..9b852ba33
--- /dev/null
+++ b/benchmark/sparse_buffer_mult_output.txt
@@ -0,0 +1,37 @@
+
+---- nnz = 1000 ----
+non zeros in A: 999 B: 999 pattern: 983
+buffer basic
+  52.428 μs (3 allocations: 7.88 KiB)
+buffer threaded
+  19.601 μs (44 allocations: 42.80 KiB)
+buffer simd
+  52.234 μs (3 allocations: 7.88 KiB)
+buffer omt
+  21.612 μs (64 allocations: 43.69 KiB)
+buffer simd threaded
+  20.130 μs (44 allocations: 42.73 KiB)
+buffer simd omt
+  21.000 μs (64 allocations: 43.69 KiB)
+buffer threaded five args
+  18.944 μs (44 allocations: 42.80 KiB)
+no buffer
+  14.522 μs (11 allocations: 42.05 KiB)
+
+---- nnz = 2418 ----
+non zeros in A: 2414 B: 2414 pattern: 5860
+buffer basic
+  65.425 μs (3 allocations: 7.88 KiB)
+buffer threaded
+  23.428 μs (44 allocations: 42.80 KiB)
+buffer simd
+  64.872 μs (3 allocations: 7.88 KiB)
+buffer omt
+  25.715 μs (64 allocations: 43.69 KiB)
+buffer simd threaded
+  22.563 μs (44 allocations: 42.73 KiB)
+buffer simd omt
+  25.541 μs (64 allocations: 43.69 KiB)
+buffer threaded five args
+  22.690 μs (44 allocations: 42.80 KiB)
+no buffer

From a239fcfcad6866cda80d204f5c9ae3198b9cc0d2 Mon Sep 17 00:00:00 2001
From: thorek1
Date: Sun, 15 Jun 2025 21:23:06 +0200
Subject: [PATCH 2/2] add task version

---
 benchmark/sparse_buffer_mult.jl | 232 +++++++++++++++++++++-----------
 1 file changed, 156 insertions(+), 76 deletions(-)

diff --git a/benchmark/sparse_buffer_mult.jl b/benchmark/sparse_buffer_mult.jl
index c2cf97386..e25ae849b 100644
--- a/benchmark/sparse_buffer_mult.jl
+++ b/benchmark/sparse_buffer_mult.jl
@@ -1,7 +1,7 @@
 using SparseArrays
 using BenchmarkTools
 using Base.Threads
-using OhMyThreads: tforeach
+using OhMyThreads: tforeach, TaskLocalValue
 using LoopVectorization
 
 """
     spmm_buffer!(C, A, B)
@@ -28,7 +28,7 @@ Fill `colmap` with a lookup for column `j` of `C`. All entries are reset to
 zero so the same vector can be reused across columns without additional
 allocations.
 """
-function _col_map!(colmap::Vector{Int}, C::SparseMatrixCSC, j::Integer)
+function _col_map!(colmap::Vector{I}, C::SparseMatrixCSC, j::Int) where I <: Integer
     fill!(colmap, 0)
     @inbounds for p in C.colptr[j]:(C.colptr[j+1]-1)
         colmap[C.rowval[p]] = p
@@ -45,7 +45,7 @@ This version is convenient when the map is not available beforehand.
 """
 function spmm_buffer_fast!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
     n = size(C, 1)
-    colmap = zeros(Int, n)
+    colmap = zeros(UInt, n)
     fill!(C.nzval, zero(eltype(C)))
     @inbounds for j in 1:size(B,2)
         _col_map!(colmap, C, j)
@@ -71,7 +71,7 @@ call. Useful for quick tests but allocates the map.
 """
 function spmm_buffer_col!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
     n = size(C, 1)
-    colmap = zeros(Int, n)
+    colmap = zeros(UInt, n)
     fill!(C.nzval, zero(eltype(C)))
     @inbounds for j in 1:size(B,2)
         _col_map!(colmap, C, j)
@@ -96,7 +96,7 @@ Threaded buffered multiplication with internal map creation; computes
 """
 function spmm_buffer_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC, α, β)
     n = size(C, 1)
-    maps = [zeros(Int, n) for _ in 1:nthreads()]
+    maps = [zeros(UInt, n) for _ in 1:nthreads()]
     if β == zero(β)
         fill!(C.nzval, zero(eltype(C)))
     else
@@ -122,90 +122,147 @@ end
 
 spmm_buffer_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) =
     spmm_buffer_threaded!(C, A, B, one(eltype(C)), zero(eltype(C)))
 
-function spmm_buffer_simd!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
-    n = size(C, 1)
-    colmap = zeros(Int, n)
-    fill!(C.nzval, zero(eltype(C)))
-    @inbounds for j in 1:size(B,2)
-        _col_map!(colmap, C, j)
-        for pb in B.colptr[j]:(B.colptr[j+1]-1)
-            k = B.rowval[pb]; b = B.nzval[pb]
-            @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
-                i = A.rowval[pa]; a = A.nzval[pa]
-                idx = colmap[i]
-                if idx != 0
-                    C.nzval[idx] = muladd(a, b, C.nzval[idx])
-                end
-            end
-        end
-    end
-    return C
-end
-
-function spmm_buffer_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
-    n = size(C, 1)
-    maps = [zeros(Int, n) for _ in 1:Threads.nthreads()]
-    fill!(C.nzval, zero(eltype(C)))
-    tforeach(1:size(B,2)) do j
-        colmap = maps[Threads.threadid()]
-        _col_map!(colmap, C, j)
-        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
-            k = B.rowval[pb]; b = B.nzval[pb]
-            for pa in A.colptr[k]:(A.colptr[k+1]-1)
-                i = A.rowval[pa]; a = A.nzval[pa]
-                idx = colmap[i]
-                if idx != 0
-                    C.nzval[idx] += a * b
-                end
-            end
-        end
-    end
-    return C
-end
-
-function spmm_buffer_simd_threaded!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
-    n = size(C, 1)
-    maps = [zeros(Int, n) for _ in 1:nthreads()]
-    fill!(C.nzval, zero(eltype(C)))
-    @threads :static for j in 1:size(B,2)
-        colmap = maps[threadid()]
-        _col_map!(colmap, C, j)
-        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
-            k = B.rowval[pb]; b = B.nzval[pb]
-            @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
-                i = A.rowval[pa]; a = A.nzval[pa]
-                idx = colmap[i]
-                if idx != 0
-                    C.nzval[idx] = muladd(a, b, C.nzval[idx])
-                end
-            end
-        end
-    end
-    return C
-end
-
-function spmm_buffer_simd_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
-    n = size(C, 1)
-    maps = [zeros(Int, n) for _ in 1:Threads.nthreads()]
-    fill!(C.nzval, zero(eltype(C)))
-    tforeach(1:size(B,2)) do j
-        colmap = maps[Threads.threadid()]
-        _col_map!(colmap, C, j)
-        @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
-            k = B.rowval[pb]; b = B.nzval[pb]
-            @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
-                i = A.rowval[pa]; a = A.nzval[pa]
-                idx = colmap[i]
-                if idx != 0
-                    C.nzval[idx] = muladd(a, b, C.nzval[idx])
-                end
-            end
-        end
-    end
-    return C
-end
-
+
+function spmm_buffer_tasks!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC, α, β)
+    n   = size(C, 1)
+    nB  = size(B, 2)
+    nth = nthreads()
+    sz  = cld(nB, nth)          # columns per chunk
+
+    # scale or zero C.nzval
+    if β == zero(β)
+        fill!(C.nzval, zero(eltype(C)))
+    else
+        @. C.nzval *= β
+    end
+
+    tasks = Vector{Task}(undef, nth)
+    for t in 1:nth
+        j0 = (t-1)*sz + 1
+        j1 = min(t*sz, nB)
+        tasks[t] = @spawn begin
+            colmap = zeros(UInt, n)
+            for j in j0:j1
+                _col_map!(colmap, C, j)
+                @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+                    k = B.rowval[pb]; b = B.nzval[pb]
+                    @inbounds @simd for pa in A.colptr[k]:(A.colptr[k+1]-1)
+                        i = A.rowval[pa]
+                        a = A.nzval[pa]
+                        idx = colmap[i]
+                        if idx != 0
+                            C.nzval[idx] += α * a * b
+                        end
+                    end
+                end
+            end
+        end
+    end
+
+    # wait for all tasks to finish
+    foreach(fetch, tasks)
+
+    return C
+end
+
+spmm_buffer_tasks!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC) =
+    spmm_buffer_tasks!(C, A, B, one(eltype(C)), zero(eltype(C)))
+
+
+# function spmm_buffer_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+#     n = size(C, 1)
+#     maps = [zeros(Int, n) for _ in 1:Threads.nthreads()]
+#     fill!(C.nzval, zero(eltype(C)))
+#     tforeach(1:size(B,2)) do j
+#         colmap = maps[Threads.threadid()]
+#         _col_map!(colmap, C, j)
+#         @inbounds for pb in B.colptr[j]:(B.colptr[j+1]-1)
+#             k = B.rowval[pb]; b = B.nzval[pb]
+#             for pa in A.colptr[k]:(A.colptr[k+1]-1)
+#                 i = A.rowval[pa]; a = A.nzval[pa]
+#                 idx = colmap[i]
+#                 if idx != 0
+#                     C.nzval[idx] += a * b
+#                 end
+#             end
+#         end
+#     end
+#     return C
+# end
+
+
+function spmm_buffer_omt_fast!(C::SparseMatrixCSC{Tv,Int},
+                               A::SparseMatrixCSC{Tv,Int},
+                               B::SparseMatrixCSC{Tv,Int}) where {Tv<:Number}
+    n = size(C,1)
+    # zero result
+    fill!(C.nzval, zero(Tv))
+
+    # task-local map buffer
+    tlv = TaskLocalValue{Vector{UInt}}(() -> zeros(UInt, n))
+
+    # hoist field accesses
+    Acolptr, Arowval, Anzval = A.colptr, A.rowval, A.nzval
+    Bcolptr, Browval, Bnzval = B.colptr, B.rowval, B.nzval
+    Ccolptr, Cnzval = C.colptr, C.nzval
+
+    # parallel over columns of B
+    tforeach(1:size(B,2)) do j
+        colmap = tlv[]            # per-task UInt[n], re-zeroed by _col_map!
+        _col_map!(colmap, C, j)
+
+        # precompute B's column slice
+        pb_start = Bcolptr[j]
+        pb_end   = Bcolptr[j+1] - 1
+
+        @inbounds for pb in pb_start:pb_end
+            k = Browval[pb]; b = Bnzval[pb]
+            # precompute A's column slice
+            pa_start = Acolptr[k]
+            pa_end   = Acolptr[k+1] - 1
+
+            @inbounds @simd for pa in pa_start:pa_end
+                i = Arowval[pa]
+                a = Anzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    Cnzval[idx] += a * b
+                end
+            end
+        end
+    end
+
+    return C
+end
+
+function spmm_buffer_omt!(C::SparseMatrixCSC, A::SparseMatrixCSC, B::SparseMatrixCSC)
+    n = size(C, 1)
+    # zero out result
+    fill!(C.nzval, zero(eltype(C)))
+
+    # create a task-local map vector, initialised once per task
+    tlv = TaskLocalValue{Vector{UInt}}(() -> zeros(UInt, n))
+
+    # loop over columns of B in parallel
+    tforeach(1:size(B, 2)) do j
+        colmap = tlv[]            # per-task UInt[n], re-zeroed by _col_map!
+        _col_map!(colmap, C, j)
+
+        @inbounds for pb in B.colptr[j]:(B.colptr[j+1] - 1)
+            k = B.rowval[pb]; b = B.nzval[pb]
+            for pa in A.colptr[k]:(A.colptr[k+1] - 1)
+                i = A.rowval[pa]; a = A.nzval[pa]
+                idx = colmap[i]
+                if idx != 0
+                    C.nzval[idx] += a * b
+                end
+            end
+        end
+    end
+
+    return C
+end
 
 spmm_no_buffer(A::SparseMatrixCSC, B::SparseMatrixCSC) = A * B
 
 function random_sparse(n, m, nnz; T=Float64)
@@ -252,36 +309,40 @@ function bench_case(n, p, m, nnzA, nnzB; extra_ratio=0.0)
     println("non zeros in A: $(nnz(A)) B: $(nnz(B)) pattern: $(nnz(pattern))")
     println("buffer basic")
     @btime spmm_buffer_fast!($C1, $A, $B)
-    println("buffer threaded")
-    @btime spmm_buffer_threaded!($C2, $A, $B)
-    println("buffer simd")
-    @btime spmm_buffer_simd!($C3, $A, $B)
-    println("buffer omt")
-    @btime spmm_buffer_omt!($C4, $A, $B)
-    println("buffer simd threaded")
-    @btime spmm_buffer_simd_threaded!($C5, $A, $B)
-    println("buffer simd omt")
-    @btime spmm_buffer_simd_omt!($C6, $A, $B)
-    println("buffer threaded five args")
-    @btime spmm_buffer_threaded!($C7, $A, $B, 1.0, 0.0)
+    # println("buffer threaded")
+    # @btime spmm_buffer_threaded!($C2, $A, $B)
+    # println("buffer SparseArrays")
+    # @btime spmm_buffer!($C3, $A, $B)
+    # println("buffer simd")
+    # @btime spmm_buffer_simd!($C3, $A, $B)
+    # println("buffer omt")
+    # @btime spmm_buffer_omt!($C3, $A, $B)
+    println("buffer tasks")
+    @btime spmm_buffer_tasks!($C4, $A, $B)
+    # println("buffer simd threaded")
+    # @btime spmm_buffer_simd_threaded!($C5, $A, $B)
+    # println("buffer simd omt")
+    # @btime spmm_buffer_simd_omt!($C6, $A, $B)
+    # println("buffer threaded five args")
+    # @btime spmm_buffer_threaded!($C7, $A, $B, 1.0, 0.0)
     println("no buffer")
     @btime spmm_no_buffer($A, $B)
 
     C_ref = spmm_no_buffer(A, B)
     @assert isapprox(C1, C_ref; atol=1e-12, rtol=1e-8)
-    @assert isapprox(C2, C_ref; atol=1e-12, rtol=1e-8)
-    @assert isapprox(C3, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C2, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C3, C_ref; atol=1e-12, rtol=1e-8)
     @assert isapprox(C4, C_ref; atol=1e-12, rtol=1e-8)
-    @assert isapprox(C5, C_ref; atol=1e-12, rtol=1e-8)
-    @assert isapprox(C6, C_ref; atol=1e-12, rtol=1e-8)
-    @assert isapprox(C7, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C5, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C6, C_ref; atol=1e-12, rtol=1e-8)
+    # @assert isapprox(C7, C_ref; atol=1e-12, rtol=1e-8)
 end
 
 function run()
     n = 1000
-    p = 1000
-    m = 1000
-    nnz_values = round.(Int, 10 .^ range(3, log10(2e5); length=7))
+    p = 100000
+    m = 100000
+    nnz_values = round.(Int, 10 .^ range(4, 6; length=3))
     for nnzA in nnz_values
         nnzB = nnzA
         println("\n---- nnz = $nnzA ----")
@@ -290,3 +351,22 @@ function run()
 end
 
 run()
+
+
+
+A = random_sparse(100, 1000000, Int(1e5))
+B = random_sparse(1000000, 1000000, Int(1e5))
+C = A * B
+C.nzval .= 0
+
+@profview for i in 1:100 spmm_buffer_tasks!(C,A,B) end
+@profview for i in 1:1000 spmm_buffer_fast!(C,A,B) end
+
+using ThreadedSparseArrays
+thA = ThreadedSparseMatrixCSC(A)
+thB = ThreadedSparseMatrixCSC(B)
+
+@benchmark $thA * $thB
+@benchmark $A * $B
+@benchmark spmm_buffer_tasks!($C,$A,$B)
+@benchmark spmm_buffer_fast!($C,$A,$B)
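
A minimal usage sketch of the buffered workflow these patches implement, assuming
the function definitions from benchmark/sparse_buffer_mult.jl are already in scope
(note that including the file as-is also runs the benchmarks): the sparsity
pattern of `C` is fixed by one allocating product, after which the buffered
variants only overwrite `C.nzval`.

    using SparseArrays

    A = random_sparse(1000, 1000, 10_000)
    B = random_sparse(1000, 1000, 10_000)

    C = A * B                          # one allocating product fixes C's sparsity pattern
    fill!(C.nzval, zero(eltype(C)))    # reset stored values, keep the pattern

    for _ in 1:100
        spmm_buffer_tasks!(C, A, B)    # writes into C's existing storage on each call
    end

    @assert isapprox(C, A * B; atol = 1e-12, rtol = 1e-8)

The `colmap` lookup is what makes the lock-free threading sound here: for a given
column `j`, `_col_map!` maps a row index to its slot in `C.nzval` in O(1), and
distinct columns own disjoint slot ranges, so tasks working on different columns
never write to the same entry.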