diff --git a/common/src/KokkosKernels_Sorting.hpp b/common/src/KokkosKernels_Sorting.hpp index f91f11c164..1080bb2c7b 100644 --- a/common/src/KokkosKernels_Sorting.hpp +++ b/common/src/KokkosKernels_Sorting.hpp @@ -17,19 +17,13 @@ #define _KOKKOSKERNELS_SORTING_HPP #include "Kokkos_Core.hpp" +#include "Kokkos_Sort.hpp" #include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum #include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space #include namespace KokkosKernels { -namespace Impl { -template -struct DefaultComparator { - KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, const Value rhs) const { return lhs < rhs; } -}; -} // namespace Impl - // ---------------------------- // General device-level sorting // ---------------------------- @@ -37,7 +31,7 @@ struct DefaultComparator { // Bitonic sort: sorts v according to the comparator object's operator(). // Default comparator is just operator< for v's element type. template > + typename Comparator = Kokkos::Experimental::Impl::StdAlgoLessThanBinaryPredicate> void bitonicSort(View v, const Comparator& comp = Comparator()); // -------------------------------------------------------- @@ -59,30 +53,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value // Team-level parallel sorting (callable inside any TeamPolicy kernel) // ------------------------------------------------------------------- -// Comparison based sorting that uses the entire team (described by mem) to sort -// raw array according to the comparator. -template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); - -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. -template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); - namespace Impl { // Functor that sorts a view on one team template struct BitonicSingleTeamFunctor { BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - KokkosKernels::TeamBitonicSort(v.data(), v.extent(0), t, - comp); - }; + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Kokkos::Experimental::sort_team(t, v, comp); }; View v; Comparator comp; }; @@ -97,8 +74,7 @@ struct BitonicChunkFunctor { Ordinal chunkStart = chunk * chunkSize; Ordinal n = chunkSize; if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - KokkosKernels::TeamBitonicSort(v.data() + chunkStart, n, - t, comp); + Kokkos::Experimental::sort_team(t, Kokkos::subview(v, Kokkos::make_pair(chunkStart, chunkStart + n)), comp); }; View v; Comparator comp; @@ -217,10 +193,11 @@ void bitonicSort(View v, const Comparator& comp) { Ordinal npot = 1; while (npot < n) npot <<= 1; // Partition the data equally among fixed number of teams - Ordinal chunkSize = 512; - Ordinal numTeams = npot / chunkSize; + Ordinal chunkSize = 512; + Ordinal numTeamsChunkSort = (n + chunkSize - 1) / chunkSize; + Ordinal numTeams = npot / chunkSize; // First, sort within teams - Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Kokkos::parallel_for(team_policy(numTeamsChunkSort, Kokkos::AUTO()), Impl::BitonicChunkFunctor(v, comp, chunkSize)); for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; @@ -388,165 +365,23 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value // trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and // memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs // Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp) { - // Algorithm only works on power-of-two input size only. - // If n is not a power-of-two, will implicitly pretend - // that values[i] for i >= n is just the max for ValueType, so it never gets - // swapped - Ordinal npot = 1; - Ordinal levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - for (Ordinal i = 0; i < levels; i++) { - for (Ordinal j = 0; j <= i; j++) { - // n/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } - }); - mem.team_barrier(); - } - } -} - -// Sort "values", while applying the same swaps to "perm" -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp) { - // Algorithm only works on power-of-two input size only. - // If n is not a power-of-two, will implicitly pretend - // that values[i] for i >= n is just the max for ValueType, so it never gets - // swapped - Ordinal npot = 1; - Ordinal levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - for (Ordinal i = 0; i < levels; i++) { - for (Ordinal j = 0; j <= i; j++) { - // n/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } - }); - mem.team_barrier(); - } - } -} - -// For backward compatibility: keep the public interface accessible in -// KokkosKernels::Impl:: -namespace Impl { - -template > -[[deprecated]] void bitonicSort(View v, const Comparator& comp = Comparator()) { - KokkosKernels::bitonicSort(v, comp); -} - -template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) { - KokkosKernels::SerialRadixSort(values, valuesAux, n); -} - -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. -template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, - PermType* permAux, Ordinal n) { - KokkosKernels::SerialRadixSort2(values, valuesAux, perm, permAux, n); -} - template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort(values, n, mem, comp); + typename Comparator = Kokkos::Experimental::Impl::StdAlgoLessThanBinaryPredicate> +[[deprecated("Use Kokkos::Experimental::sort_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort( + ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { + Kokkos::View valuesView(values, n); + Kokkos::Experimental::sort_team(mem, valuesView, comp); } -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. +// Sort "values", while applying the same swaps to "perm" template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, - const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); + typename Comparator = Kokkos::Experimental::Impl::StdAlgoLessThanBinaryPredicate> +[[deprecated("Use Kokkos::Experimental::sort_by_key_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( + ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { + Kokkos::View valuesView(values, n); + Kokkos::View permView(perm, n); + Kokkos::Experimental::sort_by_key_team(mem, valuesView, permView, comp); } -} // namespace Impl } // namespace KokkosKernels diff --git a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index 30623a8691..e4e62e5936 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -248,125 +248,6 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { } } -template -struct TestTeamBitonicFunctor { - typedef typename ValView::value_type Value; - - TestTeamBitonicFunctor(ValView& values_, OrdView& counts_, OrdView& offsets_) - : values(values_), counts(counts_), offsets(offsets_) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - int i = t.league_rank(); - KokkosKernels::TeamBitonicSort(values.data() + offsets(i), counts(i), t); - } - - ValView values; - OrdView counts; - OrdView offsets; -}; - -template -struct TestTeamBitonic2Functor { - typedef typename KeyView::value_type Key; - typedef typename ValView::value_type Value; - - TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) - : keys(keys_), values(values_), counts(counts_), offsets(offsets_) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - int i = t.league_rank(); - KokkosKernels::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), - counts(i), t); - } - - KeyView keys; - ValView values; - OrdView counts; - OrdView offsets; -}; - -template -void testTeamBitonicSort(size_t k, size_t subArraySize) { - // Create a view of randomized data - typedef typename Device::execution_space exec_space; - typedef typename Device::memory_space mem_space; - typedef Kokkos::View OrdView; - typedef Kokkos::View ValView; - OrdView counts("Subarray Sizes", k); - OrdView offsets("Subarray Offsets", k); - // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - ValView data("Bitonic sort testing data", n); - fillRandom(data); - Kokkos::View gold("Host sorted", n); - Kokkos::deep_copy(gold, data); - // Run the sorting on device in all sub-arrays in parallel - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonicFunctor(data, counts, offsets)); - // Copy result to host - auto dataHost = Kokkos::create_mirror_view(data); - Kokkos::deep_copy(dataHost, data); - // Sort using std::sort on host to do correctness test - exec_space().fence(); - auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - for (size_t i = 0; i < k; i++) { - Scalar* begin = gold.data() + offsetsHost(i); - Scalar* end = begin + countsHost(i); - std::sort(begin, end); - } - for (size_t i = 0; i < n; i++) { - ASSERT_EQ(dataHost(i), gold(i)); - } -} - -template -void testTeamBitonicSort2(size_t k, size_t subArraySize) { - // Create a view of randomized data - typedef typename Device::execution_space exec_space; - typedef typename Device::memory_space mem_space; - typedef Kokkos::View OrdView; - typedef Kokkos::View KeyView; - typedef Kokkos::View ValView; - OrdView counts("Subarray Sizes", k); - OrdView offsets("Subarray Offsets", k); - // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - KeyView keys("Bitonic test keys", n); - ValView data("Bitonic test data", n); - // The keys are randomized - fillRandom(keys, data); - Kokkos::View gold("Host sorted", n); - Kokkos::deep_copy(gold, keys); - // Run the sorting on device in all sub-arrays in parallel, just using vector - // loops Deliberately using a weird number for vector length - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonic2Functor(keys, data, counts, offsets)); - exec_space().fence(); - auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - // Sort using std::sort on host to do correctness test - for (size_t i = 0; i < k; i++) { - Key* begin = gold.data() + offsetsHost(i); - Key* end = begin + countsHost(i); - std::sort(begin, end); - } - // Copy results to host - auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); - auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); - // Make sure keys are sorted exactly (stability of sort doesn't matter) - for (size_t i = 0; i < n; i++) { - ASSERT_EQ(keysHost(i), gold(i)); - } - // Make sure the hashes of each key still matches the corresponding value - for (size_t i = 0; i < n; i++) { - auto correctHash = kvHash()(keysHost(i)); - ASSERT_EQ(dataHost(i), correctHash); - } -} - template struct CheckSortedFunctor { CheckSortedFunctor(View& v_) : v(v_) {} @@ -480,27 +361,6 @@ TEST_F(TestCategory, common_serial_radix2) { } } -TEST_F(TestCategory, common_team_bitonic) { - // Test team-level bitonic over some contiguous medium arrays - // 1st arg is #arrays, 2nd arg is max subarray size - size_t numArrays = 20; - for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort(numArrays, arrayMax); - testTeamBitonicSort(numArrays, arrayMax); - } -} - -TEST_F(TestCategory, common_team_bitonic2) { - // Test team-level bitonic over some contiguous medium arrays - // 1st arg is #arrays, 2nd arg is max subarray size - size_t numArrays = 20; - for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2>(numArrays, arrayMax); - } -} - TEST_F(TestCategory, common_device_bitonic) { // Test device-level bitonic with some larger arrays testBitonicSort(243743); diff --git a/graph/src/KokkosGraph_CoarsenConstruct.hpp b/graph/src/KokkosGraph_CoarsenConstruct.hpp index 8e1cce3ddb..668140d3be 100644 --- a/graph/src/KokkosGraph_CoarsenConstruct.hpp +++ b/graph/src/KokkosGraph_CoarsenConstruct.hpp @@ -73,8 +73,7 @@ struct SortLowDegreeCrsMatrixFunctor { Kokkos::single(Kokkos::PerTeam(t), [&]() { reducer++; }); return; } - KokkosKernels::TeamBitonicSort2(entries.data() + rowStart, - values.data() + rowStart, rowNum, t); + Kokkos::Experimental::sort_by_key_team(t, Kokkos::subview(entries, Kokkos::make_pair(rowStart, rowEnd)), Kokkos::subview(values, Kokkos::make_pair(rowStart, rowEnd))); } rowmap_t rowmap; diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index fa4fa4a54e..70bb313830 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -606,8 +606,7 @@ class PointGaussSeidel { nnz_lno_t color = t.league_rank(); nnz_lno_t colorBegin = color_xadj(color); nnz_lno_t colorLen = color_xadj(color + 1) - colorBegin; - KokkosKernels::TeamBitonicSort(color_adj.data() + colorBegin, colorLen, t, comp); - t.team_barrier(); + Kokkos::Experimental::sort_team(t, Kokkos::subview(color_adj, Kokkos::make_pair(colorBegin, colorBegin + colorLen)), comp); // Now that the color set is sorted, count how many long rows there were nnz_lno_t numLongRows; Kokkos::parallel_reduce( diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 4949887a7d..32ed74c735 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -568,7 +568,7 @@ struct MDF_compute_list_length { team.team_reduce(Kokkos::Sum(updateIdx)); // Sort update list - KokkosKernels::TeamBitonicSort(&update_list(0), updateIdx, team); + Kokkos::Experimental::sort_team(team, Kokkos::subview(update_list, Kokkos::make_pair(0, updateIdx))); } { size_type numEntrU = 0;