Skip to content

Commit 0a6a112

Browse files
authored
Deprecate redundant team-level sort functions (#2306)
* Deprecate redundant team-level sort functions. These were moved into Kokkos core a long time ago with a nicer interface and better testing. Replace our implementations with calls to the Kokkos functions such as Kokkos::Experimental::sort_team. * Formatting. Signed-off-by: Brian Kelley <bmkelle@sandia.gov> * Use our own DefaultComparator instead of the Kokkos impl one. --------- Signed-off-by: Brian Kelley <bmkelle@sandia.gov>
1 parent eca90cf commit 0a6a112

File tree

5 files changed

+23
-320
lines changed

5 files changed

+23
-320
lines changed

common/src/KokkosKernels_Sorting.hpp

Lines changed: 17 additions & 175 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#define _KOKKOSKERNELS_SORTING_HPP
1818

1919
#include "Kokkos_Core.hpp"
20+
#include "Kokkos_Sort.hpp"
2021
#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum
2122
#include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space
2223
#include <type_traits>
@@ -59,30 +60,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value
5960
// Team-level parallel sorting (callable inside any TeamPolicy kernel)
6061
// -------------------------------------------------------------------
6162

62-
// Comparison based sorting that uses the entire team (described by mem) to sort
63-
// raw array according to the comparator.
64-
template <typename Ordinal, typename ValueType, typename TeamMember,
65-
typename Comparator = Impl::DefaultComparator<ValueType>>
66-
KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
67-
const Comparator& comp = Comparator());
68-
69-
// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts
70-
// values[0...n].
71-
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember,
72-
typename Comparator = Impl::DefaultComparator<ValueType>>
73-
KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem,
74-
const Comparator& comp = Comparator());
75-
7663
namespace Impl {
7764

7865
// Functor that sorts a view on one team
7966
template <typename View, typename Ordinal, typename TeamMember, typename Comparator>
8067
struct BitonicSingleTeamFunctor {
8168
BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {}
82-
KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const {
83-
KokkosKernels::TeamBitonicSort<Ordinal, typename View::value_type, TeamMember, Comparator>(v.data(), v.extent(0), t,
84-
comp);
85-
};
69+
KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Kokkos::Experimental::sort_team(t, v, comp); };
8670
View v;
8771
Comparator comp;
8872
};
@@ -97,8 +81,7 @@ struct BitonicChunkFunctor {
9781
Ordinal chunkStart = chunk * chunkSize;
9882
Ordinal n = chunkSize;
9983
if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart;
100-
KokkosKernels::TeamBitonicSort<Ordinal, typename View::value_type, TeamMember, Comparator>(v.data() + chunkStart, n,
101-
t, comp);
84+
Kokkos::Experimental::sort_team(t, Kokkos::subview(v, Kokkos::make_pair(chunkStart, chunkStart + n)), comp);
10285
};
10386
View v;
10487
Comparator comp;
@@ -217,10 +200,11 @@ void bitonicSort(View v, const Comparator& comp) {
217200
Ordinal npot = 1;
218201
while (npot < n) npot <<= 1;
219202
// Partition the data equally among fixed number of teams
220-
Ordinal chunkSize = 512;
221-
Ordinal numTeams = npot / chunkSize;
203+
Ordinal chunkSize = 512;
204+
Ordinal numTeamsChunkSort = (n + chunkSize - 1) / chunkSize;
205+
Ordinal numTeams = npot / chunkSize;
222206
// First, sort within teams
223-
Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()),
207+
Kokkos::parallel_for(team_policy(numTeamsChunkSort, Kokkos::AUTO()),
224208
Impl::BitonicChunkFunctor<View, Ordinal, team_member, Comparator>(v, comp, chunkSize));
225209
for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) {
226210
Ordinal boxSize = teamsPerBox * chunkSize;
@@ -388,165 +372,23 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value
388372
// trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and
389373
// memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs
390374
// Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter
391-
template <typename Ordinal, typename ValueType, typename TeamMember, typename Comparator>
392-
KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
393-
const Comparator& comp) {
394-
// Algorithm only works on power-of-two input size only.
395-
// If n is not a power-of-two, will implicitly pretend
396-
// that values[i] for i >= n is just the max for ValueType, so it never gets
397-
// swapped
398-
Ordinal npot = 1;
399-
Ordinal levels = 0;
400-
while (npot < n) {
401-
levels++;
402-
npot <<= 1;
403-
}
404-
for (Ordinal i = 0; i < levels; i++) {
405-
for (Ordinal j = 0; j <= i; j++) {
406-
// n/2 pairs of items are compared in parallel
407-
Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) {
408-
// How big are the brown/pink boxes?
409-
Ordinal boxSize = Ordinal(2) << (i - j);
410-
// Which box contains this thread?
411-
Ordinal boxID = t >> (i - j); // t * 2 / boxSize;
412-
Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize
413-
Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize /
414-
// 2;
415-
Ordinal elem1 = boxStart + boxOffset;
416-
if (j == 0) {
417-
// first phase (brown box): within a block, compare with the
418-
// opposite value in the box
419-
Ordinal elem2 = boxStart + boxSize - 1 - boxOffset;
420-
if (elem2 < n) {
421-
// both elements in bounds, so compare them and swap if out of
422-
// order
423-
if (comp(values[elem2], values[elem1])) {
424-
ValueType temp = values[elem1];
425-
values[elem1] = values[elem2];
426-
values[elem2] = temp;
427-
}
428-
}
429-
} else {
430-
// later phases (pink box): within a block, compare with fixed
431-
// distance (boxSize / 2) apart
432-
Ordinal elem2 = elem1 + boxSize / 2;
433-
if (elem2 < n) {
434-
if (comp(values[elem2], values[elem1])) {
435-
ValueType temp = values[elem1];
436-
values[elem1] = values[elem2];
437-
values[elem2] = temp;
438-
}
439-
}
440-
}
441-
});
442-
mem.team_barrier();
443-
}
444-
}
445-
}
446-
447-
// Sort "values", while applying the same swaps to "perm"
448-
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember, typename Comparator>
449-
KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem,
450-
const Comparator& comp) {
451-
// Algorithm only works on power-of-two input size only.
452-
// If n is not a power-of-two, will implicitly pretend
453-
// that values[i] for i >= n is just the max for ValueType, so it never gets
454-
// swapped
455-
Ordinal npot = 1;
456-
Ordinal levels = 0;
457-
while (npot < n) {
458-
levels++;
459-
npot <<= 1;
460-
}
461-
for (Ordinal i = 0; i < levels; i++) {
462-
for (Ordinal j = 0; j <= i; j++) {
463-
// n/2 pairs of items are compared in parallel
464-
Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) {
465-
// How big are the brown/pink boxes?
466-
Ordinal boxSize = Ordinal(2) << (i - j);
467-
// Which box contains this thread?
468-
Ordinal boxID = t >> (i - j); // t * 2 / boxSize;
469-
Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize
470-
Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize /
471-
// 2;
472-
Ordinal elem1 = boxStart + boxOffset;
473-
if (j == 0) {
474-
// first phase (brown box): within a block, compare with the
475-
// opposite value in the box
476-
Ordinal elem2 = boxStart + boxSize - 1 - boxOffset;
477-
if (elem2 < n) {
478-
// both elements in bounds, so compare them and swap if out of
479-
// order
480-
if (comp(values[elem2], values[elem1])) {
481-
ValueType temp1 = values[elem1];
482-
values[elem1] = values[elem2];
483-
values[elem2] = temp1;
484-
PermType temp2 = perm[elem1];
485-
perm[elem1] = perm[elem2];
486-
perm[elem2] = temp2;
487-
}
488-
}
489-
} else {
490-
// later phases (pink box): within a block, compare with fixed
491-
// distance (boxSize / 2) apart
492-
Ordinal elem2 = elem1 + boxSize / 2;
493-
if (elem2 < n) {
494-
if (comp(values[elem2], values[elem1])) {
495-
ValueType temp1 = values[elem1];
496-
values[elem1] = values[elem2];
497-
values[elem2] = temp1;
498-
PermType temp2 = perm[elem1];
499-
perm[elem1] = perm[elem2];
500-
perm[elem2] = temp2;
501-
}
502-
}
503-
}
504-
});
505-
mem.team_barrier();
506-
}
507-
}
508-
}
509-
510-
// For backward compatibility: keep the public interface accessible in
511-
// KokkosKernels::Impl::
512-
namespace Impl {
513-
514-
template <typename View, typename ExecSpace, typename Ordinal,
515-
typename Comparator = Impl::DefaultComparator<typename View::value_type>>
516-
[[deprecated]] void bitonicSort(View v, const Comparator& comp = Comparator()) {
517-
KokkosKernels::bitonicSort<View, ExecSpace, Ordinal, Comparator>(v, comp);
518-
}
519-
520-
template <typename Ordinal, typename ValueType>
521-
[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) {
522-
KokkosKernels::SerialRadixSort<Ordinal, ValueType>(values, valuesAux, n);
523-
}
524-
525-
// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts
526-
// values[0...n].
527-
template <typename Ordinal, typename ValueType, typename PermType>
528-
[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm,
529-
PermType* permAux, Ordinal n) {
530-
KokkosKernels::SerialRadixSort2<Ordinal, ValueType, PermType>(values, valuesAux, perm, permAux, n);
531-
}
532-
533375
template <typename Ordinal, typename ValueType, typename TeamMember,
534376
typename Comparator = Impl::DefaultComparator<ValueType>>
535-
[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
536-
const Comparator& comp = Comparator()) {
537-
KokkosKernels::TeamBitonicSort<Ordinal, ValueType, TeamMember, Comparator>(values, n, mem, comp);
377+
[[deprecated("Use Kokkos::Experimental::sort_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(
378+
ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) {
379+
Kokkos::View<ValueType*, Kokkos::AnonymousSpace> valuesView(values, n);
380+
Kokkos::Experimental::sort_team(mem, valuesView, comp);
538381
}
539382

540-
// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts
541-
// values[0...n].
383+
// Sort "values", while applying the same swaps to "perm"
542384
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember,
543385
typename Comparator = Impl::DefaultComparator<ValueType>>
544-
[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n,
545-
const TeamMember mem,
546-
const Comparator& comp = Comparator()) {
547-
KokkosKernels::TeamBitonicSort2<Ordinal, ValueType, PermType, TeamMember, Comparator>(values, perm, n, mem, comp);
386+
[[deprecated("Use Kokkos::Experimental::sort_by_key_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(
387+
ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) {
388+
Kokkos::View<ValueType*, Kokkos::AnonymousSpace> valuesView(values, n);
389+
Kokkos::View<PermType*, Kokkos::AnonymousSpace> permView(perm, n);
390+
Kokkos::Experimental::sort_by_key_team(mem, valuesView, permView, comp);
548391
}
549-
} // namespace Impl
550392

551393
} // namespace KokkosKernels
552394

common/unit_test/Test_Common_Sorting.hpp

Lines changed: 0 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -248,125 +248,6 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) {
248248
}
249249
}
250250

251-
template <typename ValView, typename OrdView>
252-
struct TestTeamBitonicFunctor {
253-
typedef typename ValView::value_type Value;
254-
255-
TestTeamBitonicFunctor(ValView& values_, OrdView& counts_, OrdView& offsets_)
256-
: values(values_), counts(counts_), offsets(offsets_) {}
257-
258-
template <typename TeamMem>
259-
KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
260-
int i = t.league_rank();
261-
KokkosKernels::TeamBitonicSort<int, Value, TeamMem>(values.data() + offsets(i), counts(i), t);
262-
}
263-
264-
ValView values;
265-
OrdView counts;
266-
OrdView offsets;
267-
};
268-
269-
template <typename KeyView, typename ValView, typename OrdView>
270-
struct TestTeamBitonic2Functor {
271-
typedef typename KeyView::value_type Key;
272-
typedef typename ValView::value_type Value;
273-
274-
TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_)
275-
: keys(keys_), values(values_), counts(counts_), offsets(offsets_) {}
276-
277-
template <typename TeamMem>
278-
KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
279-
int i = t.league_rank();
280-
KokkosKernels::TeamBitonicSort2<int, Key, Value, TeamMem>(keys.data() + offsets(i), values.data() + offsets(i),
281-
counts(i), t);
282-
}
283-
284-
KeyView keys;
285-
ValView values;
286-
OrdView counts;
287-
OrdView offsets;
288-
};
289-
290-
template <typename Device, typename Scalar>
291-
void testTeamBitonicSort(size_t k, size_t subArraySize) {
292-
// Create a view of randomized data
293-
typedef typename Device::execution_space exec_space;
294-
typedef typename Device::memory_space mem_space;
295-
typedef Kokkos::View<int*, mem_space> OrdView;
296-
typedef Kokkos::View<Scalar*, mem_space> ValView;
297-
OrdView counts("Subarray Sizes", k);
298-
OrdView offsets("Subarray Offsets", k);
299-
// Generate k sub-array sizes, each with size about 20
300-
size_t n = generateRandomOffsets<OrdView, exec_space>(counts, offsets, k, subArraySize);
301-
ValView data("Bitonic sort testing data", n);
302-
fillRandom(data);
303-
Kokkos::View<Scalar*, Kokkos::HostSpace> gold("Host sorted", n);
304-
Kokkos::deep_copy(gold, data);
305-
// Run the sorting on device in all sub-arrays in parallel
306-
Kokkos::parallel_for(Kokkos::TeamPolicy<exec_space>(k, Kokkos::AUTO()),
307-
TestTeamBitonicFunctor<ValView, OrdView>(data, counts, offsets));
308-
// Copy result to host
309-
auto dataHost = Kokkos::create_mirror_view(data);
310-
Kokkos::deep_copy(dataHost, data);
311-
// Sort using std::sort on host to do correctness test
312-
exec_space().fence();
313-
auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts);
314-
auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets);
315-
for (size_t i = 0; i < k; i++) {
316-
Scalar* begin = gold.data() + offsetsHost(i);
317-
Scalar* end = begin + countsHost(i);
318-
std::sort(begin, end);
319-
}
320-
for (size_t i = 0; i < n; i++) {
321-
ASSERT_EQ(dataHost(i), gold(i));
322-
}
323-
}
324-
325-
template <typename Device, typename Key, typename Value>
326-
void testTeamBitonicSort2(size_t k, size_t subArraySize) {
327-
// Create a view of randomized data
328-
typedef typename Device::execution_space exec_space;
329-
typedef typename Device::memory_space mem_space;
330-
typedef Kokkos::View<int*, mem_space> OrdView;
331-
typedef Kokkos::View<Key*, mem_space> KeyView;
332-
typedef Kokkos::View<Value*, mem_space> ValView;
333-
OrdView counts("Subarray Sizes", k);
334-
OrdView offsets("Subarray Offsets", k);
335-
// Generate k sub-array sizes, each with size about 20
336-
size_t n = generateRandomOffsets<OrdView, exec_space>(counts, offsets, k, subArraySize);
337-
KeyView keys("Bitonic test keys", n);
338-
ValView data("Bitonic test data", n);
339-
// The keys are randomized
340-
fillRandom(keys, data);
341-
Kokkos::View<Key*, Kokkos::HostSpace> gold("Host sorted", n);
342-
Kokkos::deep_copy(gold, keys);
343-
// Run the sorting on device in all sub-arrays in parallel, just using vector
344-
// loops Deliberately using a weird number for vector length
345-
Kokkos::parallel_for(Kokkos::TeamPolicy<exec_space>(k, Kokkos::AUTO()),
346-
TestTeamBitonic2Functor<KeyView, ValView, OrdView>(keys, data, counts, offsets));
347-
exec_space().fence();
348-
auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts);
349-
auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets);
350-
// Sort using std::sort on host to do correctness test
351-
for (size_t i = 0; i < k; i++) {
352-
Key* begin = gold.data() + offsetsHost(i);
353-
Key* end = begin + countsHost(i);
354-
std::sort(begin, end);
355-
}
356-
// Copy results to host
357-
auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys);
358-
auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data);
359-
// Make sure keys are sorted exactly (stability of sort doesn't matter)
360-
for (size_t i = 0; i < n; i++) {
361-
ASSERT_EQ(keysHost(i), gold(i));
362-
}
363-
// Make sure the hashes of each key still matches the corresponding value
364-
for (size_t i = 0; i < n; i++) {
365-
auto correctHash = kvHash<Key, Value>()(keysHost(i));
366-
ASSERT_EQ(dataHost(i), correctHash);
367-
}
368-
}
369-
370251
template <typename View>
371252
struct CheckSortedFunctor {
372253
CheckSortedFunctor(View& v_) : v(v_) {}
@@ -480,27 +361,6 @@ TEST_F(TestCategory, common_serial_radix2) {
480361
}
481362
}
482363

483-
TEST_F(TestCategory, common_team_bitonic) {
484-
// Test team-level bitonic over some contiguous medium arrays
485-
// 1st arg is #arrays, 2nd arg is max subarray size
486-
size_t numArrays = 20;
487-
for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) {
488-
testTeamBitonicSort<TestDevice, char>(numArrays, arrayMax);
489-
testTeamBitonicSort<TestDevice, int>(numArrays, arrayMax);
490-
}
491-
}
492-
493-
TEST_F(TestCategory, common_team_bitonic2) {
494-
// Test team-level bitonic over some contiguous medium arrays
495-
// 1st arg is #arrays, 2nd arg is max subarray size
496-
size_t numArrays = 20;
497-
for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) {
498-
testTeamBitonicSort2<TestDevice, char, int>(numArrays, arrayMax);
499-
testTeamBitonicSort2<TestDevice, int, double>(numArrays, arrayMax);
500-
testTeamBitonicSort2<TestDevice, int, Kokkos::complex<double>>(numArrays, arrayMax);
501-
}
502-
}
503-
504364
TEST_F(TestCategory, common_device_bitonic) {
505365
// Test device-level bitonic with some larger arrays
506366
testBitonicSort<TestDevice, char>(243743);

0 commit comments

Comments
 (0)