Skip to content

Commit

Permalink
Make GridNeighborhoodSearch with FullGridCellList run on the GPU (#…
Browse files Browse the repository at this point in the history
…45)

* Make code run on the GPU

* Use GPUArraysCore.jl to import `AbstractGPUArray`

* Implement suggestions
  • Loading branch information
efaulhaber authored Jul 1, 2024
1 parent 537bf15 commit f407b7d
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 59 deletions.
4 changes: 4 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@ version = "0.4.1-dev"
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
Atomix = "0.1"
GPUArraysCore = "0.1"
KernelAbstractions = "0.9"
LinearAlgebra = "1"
Polyester = "0.7.5"
Reexport = "1"
Expand Down
4 changes: 3 additions & 1 deletion src/PointNeighbors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ using Reexport: @reexport

using Adapt: Adapt
using Atomix: Atomix
using GPUArraysCore: AbstractGPUArray
using KernelAbstractions: KernelAbstractions, @kernel, @index
using LinearAlgebra: dot
using Polyester: @batch
using Polyester: Polyester
@reexport using StaticArrays: SVector

include("util.jl")
Expand Down
3 changes: 3 additions & 0 deletions src/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch)
return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells,
cell_size, update_buffer, nhs.update_strategy)
end

# This is useful to pass the backend directly to `@threaded`:
# `get_backend` on a `Backend` is the identity, so `@threaded` can accept either
# an array (backend derived from it) or an explicit backend.
# NOTE(review): this adds a method to a KernelAbstractions-owned function for a
# KernelAbstractions-owned type (type piracy). If KernelAbstractions ever defines
# `get_backend(::Backend)` itself, this method becomes redundant — confirm upstream.
KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend
32 changes: 28 additions & 4 deletions src/neighborhood_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -130,19 +130,43 @@ Note that `system_coords` and `neighbor_coords` can be identical.
See also [`initialize!`](@ref), [`update!`](@ref).
"""
function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search;
                                parallel::Union{Bool, KernelAbstractions.Backend} = true,
                                points = axes(system_coords, 2)) where {T}
    # The type annotation `f::T` is to make Julia specialize on the type of the function.
    # Otherwise, unspecialized code will cause a lot of allocations
    # and heavily impact performance.
    # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
    if parallel isa Bool
        # When `false` is passed, run serially. When `true` is passed, run either a
        # threaded loop with `Polyester.@batch`, or, when `system_coords` is a GPU array,
        # launch the loop as a kernel on the GPU.
        parallel_ = Val(parallel)
    else
        # `parallel isa KernelAbstractions.Backend` (the annotation above makes this
        # branch exhaustive, so `parallel_` is always defined).
        # WARNING! Undocumented, experimental feature:
        # When a `KernelAbstractions.Backend` is passed, launch the loop as a GPU kernel
        # on this backend. This is useful to test the GPU code on the CPU by passing
        # `parallel = KernelAbstractions.CPU()`, even though `system_coords isa Array`.
        parallel_ = parallel
    end

    return foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search,
                                  points, parallel_)
end

# `parallel == true`: run a threaded CPU loop, or launch the loop as a GPU kernel when
# `system_coords` is a GPU array (the backend is derived from `system_coords`).
@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
                                        neighborhood_search, points, parallel::Val{true})
    @threaded system_coords for point in points
        foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
    end

    return nothing
end

# When a `KernelAbstractions.Backend` is passed, launch a GPU kernel on this backend
@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, points,
backend::KernelAbstractions.Backend)
@threaded backend for point in points
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
end

Expand Down
46 changes: 29 additions & 17 deletions src/nhs_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -183,38 +183,45 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra
return neighborhood_search
end

# WARNING! Undocumented, experimental feature:
# By default, determine the parallelization backend from the type of `x`.
# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code
# on this backend. This can be useful to run GPU kernels on the CPU by passing
# `parallelization_backend = KernelAbstractions.CPU()`, even though `x isa Array`.
function update!(neighborhood_search::GridNeighborhoodSearch,
                 x::AbstractMatrix, y::AbstractMatrix;
                 points_moving = (true, true), parallelization_backend = x)
    # The coordinates of the first set of points are irrelevant for this NHS.
    # Only update when the second set is moving.
    points_moving[2] || return neighborhood_search

    return update_grid!(neighborhood_search, y; parallelization_backend)
end

# Update only with neighbor coordinates.
# Wrap the coordinate matrix in a closure returning `SVector`s, so that the core update
# code only needs point indices.
function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS},
                      y::AbstractMatrix; parallelization_backend = y) where {NDIMS}
    return update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i);
                        parallelization_backend)
end

# Serial and semi-parallel update
# Serial and semi-parallel update.
# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`.
function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any,
SerialUpdate},
GridNeighborhoodSearch{<:Any,
SemiParallelUpdate}},
coords_fun::Function)
coords_fun::Function; parallelization_backend = nothing)
(; cell_list, update_buffer) = neighborhood_search

# Empty each thread's list
@threaded for i in eachindex(update_buffer)
@threaded parallelization_backend for i in eachindex(update_buffer)
emptyat!(update_buffer, i)
end

# Find all cells containing points that now belong to another cell.
# This loop is threaded for `update_strategy == SemiParallelUpdate`.
mark_changed_cells!(neighborhood_search, coords_fun)
# This loop is threaded for `update_strategy == SemiParallelUpdate()`.
mark_changed_cells!(neighborhood_search, coords_fun, parallelization_backend)

# Iterate over all marked cells and move the points into their new cells.
# This is always a serial loop (hence "semi-parallel").
Expand Down Expand Up @@ -252,18 +259,22 @@ end
# See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
# Semi-parallel update: mark changed cells in a threaded loop.
@inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                                 SemiParallelUpdate},
                                     coords_fun::T, parallelization_backend) where {T}
    (; cell_list) = neighborhood_search

    # `each_cell_index(cell_list)` might return a `KeySet`, which has to be `collect`ed
    # first to be able to be used in a threaded loop. This function takes care of that.
    @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
        mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
    end

    return nothing
end

# Serial update: mark changed cells in a plain loop.
# The parallelization backend argument is accepted for interface parity but unused.
@inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                                 SerialUpdate},
                                     coords_fun::T, _) where {T}
    (; cell_list) = neighborhood_search

    for cell_index in each_cell_index(cell_list)
        mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
    end

    return nothing
end
Expand All @@ -285,17 +296,18 @@ end
end
end

# Fully parallel update with atomic push
# Fully parallel update with atomic push.
# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`.
function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate},
coords_fun::Function)
coords_fun::Function; parallelization_backend = nothing)
(; cell_list) = neighborhood_search

# Note that we need two separate loops for adding and removing points.
# `push_cell_atomic!` only guarantees thread-safety when different threads push
# simultaneously, but it does not work when `deleteat_cell!` is called at the same time.

# Add points to new cells
@threaded for cell_index in each_cell_index_threadable(cell_list)
@threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
for point in cell_list[cell_index]
cell_coords_ = cell_coords(coords_fun(point), neighborhood_search)

Expand All @@ -309,7 +321,7 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle
end

# Remove points from old cells
@threaded for cell_index in each_cell_index_threadable(cell_list)
@threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
points = cell_list[cell_index]

# WARNING!!!
Expand Down
95 changes: 58 additions & 37 deletions src/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,53 +23,74 @@ end
end

"""
@threaded for ... end
@threaded x for ... end
Run either a threaded CPU loop or launch a kernel on the GPU, depending on the type of `x`.
Semantically the same as `Threads.@threads` when iterating over an `AbstractUnitRange`
but without guarantee that the underlying implementation uses `Threads.@threads`
or works for more general `for` loops.
In particular, there may be an additional check whether only one thread is used
to reduce the overhead of serial execution or the underlying threading capabilities
might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
The first argument must either be a `KernelAbstractions.Backend` or an array from which the
backend can be derived to determine if the loop must be run threaded on the CPU
or launched as a kernel on the GPU. Passing `KernelAbstractions.CPU()` will run the GPU
kernel on the CPU.
In particular, the underlying threading capabilities might be provided by other packages
such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
!!! warning
This macro does not necessarily work for general `for` loops. For example,
it does not necessarily support general iterables such as `eachline(filename)`.
Some discussion can be found at
[https://discourse.julialang.org/t/overhead-of-threads-threads/53964](https://discourse.julialang.org/t/overhead-of-threads-threads/53964)
and
[https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435](https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435).
Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl).
"""
macro threaded(expr)
# Use `esc(quote ... end)` for nested macro calls as suggested in
# https://github.com/JuliaLang/julia/issues/23221
#
# The following code is a simple version using only `Threads.@threads` from the
# standard library with an additional check whether only a single thread is used
# to reduce some overhead (and allocations) for serial execution.
#
# return esc(quote
# let
# if Threads.nthreads() == 1
# $(expr)
# else
# Threads.@threads $(expr)
# end
# end
# end)
#
# However, the code below using `@batch` from Polyester.jl is more efficient,
# since this packages provides threads with less overhead. Since it is written
# by Chris Elrod, the author of LoopVectorization.jl, we expect this package
# to provide the most efficient and useful implementation of threads (as we use
# them) available in Julia.
# !!! danger "Heisenbug"
# Look at the comments for `wrap_array` when considering to change this macro.
macro threaded(system, expr)
    # Reverse-engineer the for loop.
    # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`.
    # So, `expr.args[1].args[2]` is the iterator `eachindex(x)`
    # and `expr.args[1].args[1]` is the loop variable `i`.
    iterator = expr.args[1].args[2]
    i = expr.args[1].args[1]
    inner_loop = expr.args[2]

    # Assemble the `for` loop again as a call to `parallel_foreach`, using `$i` to use the
    # same loop variable as used in the for loop.
    # `esc` is needed so that `iterator` and the loop body are evaluated in caller scope.
    return esc(quote
        PointNeighbors.parallel_foreach($iterator, $system) do $i
            $inner_loop
        end
    end)
end

# CPU fallback: delegate to `Polyester.@batch`, which provides low-overhead threading
@inline function parallel_foreach(f, iterator, x)
    Polyester.@batch for idx in iterator
        @inline f(idx)
    end
end

# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl.
# `x` is either an `AbstractGPUArray` (backend derived from the array) or an explicit
# `KernelAbstractions.Backend` (e.g. `CPU()` to run the GPU code path on the CPU).
@inline function parallel_foreach(f, iterator,
                                  x::Union{AbstractGPUArray, KernelAbstractions.Backend})
    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
    # and index with `iterator[indices[i]]`.
    # Note that this only works with vector-like iterators that support arbitrary indexing.
    indices = eachindex(iterator)
    ndrange = length(indices)

    # Skip empty loops to avoid launching a kernel with zero work items
    ndrange == 0 && return nothing

    backend = KernelAbstractions.get_backend(x)

    # Call the generic kernel that is defined below, which only calls a function with
    # the global GPU index.
    generic_kernel(backend)(ndrange = ndrange) do i
        @inline f(iterator[indices[i]])
    end

    # Kernel launches are asynchronous; block until all work has finished
    KernelAbstractions.synchronize(backend)

    # Return `nothing` on every path (the early return above also yields `nothing`)
    return nothing
end

# Generic kernel that simply applies `f` to every global index in the `ndrange`
@kernel function generic_kernel(f)
    global_index = @index(Global)
    @inline f(global_index)
end

0 comments on commit f407b7d

Please sign in to comment.