Skip to content

Commit

Permalink
Work around kokkos issue 7036
Browse files Browse the repository at this point in the history
Using sort_by_key on host backends with nvcc as the compiler would produce build
errors. The fix is present in Kokkos >= 40400 (4.4.0 and up, including
the current develop branch). So for older versions, when building with CUDA
enabled, disable the bulk sorting code paths that use sort_by_key.
  • Loading branch information
brian-kelley committed Aug 6, 2024
1 parent 01db3a1 commit dc2929b
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 12 deletions.
26 changes: 18 additions & 8 deletions sparse/impl/KokkosSparse_sort_crs_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
#include "Kokkos_Core.hpp"
#include "Kokkos_Sort.hpp"

// Kokkos issue #7036: Kokkos::Experimental::sort_by_key fails to build when
// nvcc is the compiler and a host backend such as OpenMP is enabled.
// The issue is fixed in the Kokkos 4.4 release.
// For older Kokkos versions with CUDA enabled, KK_DISABLE_BULK_SORT_BY_KEY is
// defined so that code paths calling sort_by_key can be compiled out.
// Once support for Kokkos < 4.4 is dropped, this macro and every preprocessor
// branch that is only active when it is defined can be deleted.
#if defined(KOKKOS_ENABLE_CUDA)
#if KOKKOS_VERSION < 40400
#define KK_DISABLE_BULK_SORT_BY_KEY
#endif
#endif

namespace KokkosSparse {
namespace Impl {

Expand Down Expand Up @@ -244,6 +252,7 @@ Kokkos::View<uint64_t*, ExecSpace> generateBulkCrsKeys(const ExecSpace& exec, co
return keys;
}

#ifndef KK_DISABLE_BULK_SORT_BY_KEY
template <typename ExecSpace, typename Rowmap, typename Entries>
Kokkos::View<typename Rowmap::non_const_value_type*, ExecSpace> computeEntryPermutation(
const ExecSpace& exec, const Rowmap& rowmap, const Entries& entries, typename Entries::non_const_value_type ncols) {
Expand All @@ -258,6 +267,15 @@ Kokkos::View<typename Rowmap::non_const_value_type*, ExecSpace> computeEntryPerm
return permutation;
}

// Decides whether the bulk sorting algorithm should be chosen, given the
// average (avgDeg) and maximum (maxDeg) row degree supplied by the caller.
// Returns true when either criterion below holds.
template <typename Ordinal>
bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) {
  // Criterion 1: the longest rows have many entries.
  if (maxDeg > 1024) {
    return true;
  }
  // Criterion 2: the matrix is highly imbalanced, i.e. a tenth of the
  // maximum degree still exceeds the average degree.
  const auto tenthOfMax = maxDeg / 10;
  return tenthOfMax > avgDeg;
}
#endif

template <typename ExecSpace, typename Permutation, typename InView, typename OutView>
void applyPermutation(const ExecSpace& exec, const Permutation& permutation, const InView& in, const OutView& out) {
Kokkos::parallel_for(
Expand All @@ -281,14 +299,6 @@ void applyPermutationBlockValues(const ExecSpace& exec, const Permutation& permu
});
}

// Heuristic that selects the bulk sorting algorithm based on the average
// (avgDeg) and maximum (maxDeg) row degree supplied by the caller.
template <typename Ordinal>
bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) {
  // Highly imbalanced: a tenth of the maximum degree exceeds the average.
  const bool highlyImbalanced = maxDeg / 10 > avgDeg;
  // The longest rows have many entries.
  const bool hasLongRows = maxDeg > 1024;
  return highlyImbalanced || hasLongRows;
}

} // namespace Impl
} // namespace KokkosSparse

Expand Down
39 changes: 35 additions & 4 deletions sparse/src/KokkosSparse_SortCrs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const
// If the matrix is highly imbalanced, or has long rows AND the dimensions
// are not too large to do one large bulk sort, do that. Otherwise, sort
// using one Kokkos thread per row.
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
#ifndef KK_DISABLE_BULK_SORT_BY_KEY
Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap);
bool useBulkSort = false;
if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) {
Expand All @@ -113,7 +114,11 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const
Kokkos::deep_copy(exec, origEntries, entries);
KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, entries);
KokkosSparse::Impl::applyPermutation(exec, permutation, origValues, values);
} else {
} else
#else
(void)numCols;
#endif
{
using TeamPol = Kokkos::TeamPolicy<execution_space>;
// Can't use bulk sort approach as matrix dimensions are too large.
// Fall back to parallel thread-level sort within each row.
Expand Down Expand Up @@ -179,7 +184,28 @@ void sort_bsr_matrix(const execution_space& exec, Ordinal blockSize, const rowma
throw std::invalid_argument(
"sort_bsr_matrix: implementation requires that numRows * numCols is "
"representable in uint64_t");
// Compute `permutation` for the entries. When KK_DISABLE_BULK_SORT_BY_KEY is
// defined, a team-policy kernel running MatrixSortThreadFunctor is used instead
// of Impl::computeEntryPermutation.
#ifdef KK_DISABLE_BULK_SORT_BY_KEY
using TeamPol = Kokkos::TeamPolicy<execution_space>;
using Offset = typename rowmap_t::non_const_value_type;
// Temporary workaround: do not use Kokkos::Experimental::sort_by_key, instead
// sort bulk keys one row at a time
// NOTE(review): `keys` is not referenced again in this branch as visible here;
// the functor below is constructed from `entries`, not `keys` - confirm whether
// `keys` was meant to be the functor's key view or can be removed.
auto keys = Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols);
// NOTE(review): `permutation` is allocated WithoutInitializing and no fill is
// visible before it is handed to the functor. If MatrixSortThreadFunctor treats
// it as values to be reordered alongside the keys, it presumably needs to hold
// 0..entries.extent(0)-1 first - TODO confirm against the functor definition.
Kokkos::View<Offset*, execution_space> permutation(Kokkos::view_alloc(Kokkos::WithoutInitializing, "permutation"),
entries.extent(0));
// Vector length: doubled from 1 until it reaches at least avgDeg / 2,
// then capped at the policy's maximum vector length.
Ordinal vectorLength = 1;
// Ceiling of entries.extent(0) / numRows.
// NOTE(review): divides by numRows; numRows == 0 must be handled before this
// point (not visible here).
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
while (vectorLength < avgDeg / 2) {
vectorLength *= 2;
}
if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max();
Impl::MatrixSortThreadFunctor<TeamPol, Ordinal, rowmap_t, entries_t, decltype(permutation)> funct(
numRows, rowmap, entries, permutation);
// Ask Kokkos for a recommended team size for this functor, then launch
// ceil(numRows / teamSize) teams.
Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag());
Kokkos::parallel_for("sort_bulk_keys_by_row[GPU,bitonic]",
TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct);
#else
// Normal path: obtain the permutation from Impl::computeEntryPermutation.
auto permutation = KokkosSparse::Impl::computeEntryPermutation(exec, rowmap, entries, numCols);
#endif
// Permutations cannot be done in-place
Kokkos::View<typename values_t::value_type*, execution_space> origValues(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), values.extent(0));
Expand Down Expand Up @@ -254,7 +280,8 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e
// If the graph is highly imbalanced AND the dimensions are not too large
// to do one large bulk sort, do that. Otherwise, sort using one Kokkos
// thread per row.
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
#ifndef KK_DISABLE_BULK_SORT_BY_KEY
Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap);
bool useBulkSort = false;
if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) {
Expand All @@ -269,7 +296,11 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e
if (useBulkSort) {
auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols);
Kokkos::Experimental::sort_by_key(exec, keys, entries);
} else {
} else
#else
(void)numCols;
#endif
{
using TeamPol = Kokkos::TeamPolicy<execution_space>;
// Fall back to thread-level sort within each row
Ordinal vectorLength = 1;
Expand Down

0 comments on commit dc2929b

Please sign in to comment.