Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate redundant team-level sort functions #2306

Merged
merged 3 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 17 additions & 175 deletions common/src/KokkosKernels_Sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define _KOKKOSKERNELS_SORTING_HPP

#include "Kokkos_Core.hpp"
#include "Kokkos_Sort.hpp"
#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum
#include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space
#include <type_traits>
Expand Down Expand Up @@ -59,30 +60,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value
// Team-level parallel sorting (callable inside any TeamPolicy kernel)
// -------------------------------------------------------------------

// Comparison based sorting that uses the entire team (described by mem) to sort
// raw array according to the comparator.
template <typename Ordinal, typename ValueType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
const Comparator& comp = Comparator());

// Same as TeamBitonicSort, but also permutes perm[0...n] as it sorts
// values[0...n].
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem,
const Comparator& comp = Comparator());

namespace Impl {

// Functor that sorts a view on one team
// Stores a copy of the view handle and of the comparator; every invocation of
// operator() sorts the whole view (all of extent(0)) using the team handle it
// receives.
template <typename View, typename Ordinal, typename TeamMember, typename Comparator>
struct BitonicSingleTeamFunctor {
BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {}
// Variant that hands the raw data pointer and extent(0) to
// KokkosKernels::TeamBitonicSort.
KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const {
KokkosKernels::TeamBitonicSort<Ordinal, typename View::value_type, TeamMember, Comparator>(v.data(), v.extent(0), t,
comp);
};
// Variant that passes the view itself to Kokkos::Experimental::sort_team.
KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Kokkos::Experimental::sort_team(t, v, comp); };
// View to be sorted.
View v;
// Comparator forwarded to the team-level sort.
Comparator comp;
};
Expand All @@ -97,8 +81,7 @@ struct BitonicChunkFunctor {
Ordinal chunkStart = chunk * chunkSize;
Ordinal n = chunkSize;
if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart;
KokkosKernels::TeamBitonicSort<Ordinal, typename View::value_type, TeamMember, Comparator>(v.data() + chunkStart, n,
t, comp);
Kokkos::Experimental::sort_team(t, Kokkos::subview(v, Kokkos::make_pair(chunkStart, chunkStart + n)), comp);
};
View v;
Comparator comp;
Expand Down Expand Up @@ -217,10 +200,11 @@ void bitonicSort(View v, const Comparator& comp) {
Ordinal npot = 1;
while (npot < n) npot <<= 1;
// Partition the data equally among fixed number of teams
Ordinal chunkSize = 512;
Ordinal numTeams = npot / chunkSize;
Ordinal chunkSize = 512;
Ordinal numTeamsChunkSort = (n + chunkSize - 1) / chunkSize;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is because the Kokkos sort_team exposed a bug where too many teams ran this functor, so chunks on some teams had negative length.

Ordinal numTeams = npot / chunkSize;
// First, sort within teams
Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()),
Kokkos::parallel_for(team_policy(numTeamsChunkSort, Kokkos::AUTO()),
Impl::BitonicChunkFunctor<View, Ordinal, team_member, Comparator>(v, comp, chunkSize));
for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) {
Ordinal boxSize = teamsPerBox * chunkSize;
Expand Down Expand Up @@ -388,165 +372,23 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value
// trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and
// memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs
// Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter
// Team-level bitonic sort of values[0..n) using comparator comp.
//   values : raw array, sorted in place
//   n      : number of elements
//   mem    : team handle; all work is issued through TeamVectorRange(mem, ...)
//            and separated by mem.team_barrier()
//   comp   : comp(a, b) returning true triggers a swap when a sits above b
template <typename Ordinal, typename ValueType, typename TeamMember, typename Comparator>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
                                            const Comparator& comp) {
  // The compare/exchange network is laid out for the next power of two >= n.
  // Pairs whose upper index lands at or past n are simply skipped, so the
  // missing tail behaves like elements that never move.
  Ordinal paddedSize = 1;
  Ordinal numLevels  = 0;
  for (; paddedSize < n; paddedSize <<= 1) ++numLevels;

  for (Ordinal level = 0; level < numLevels; level++) {
    for (Ordinal phase = 0; phase <= level; phase++) {
      // paddedSize/2 pairs are compared in parallel in each phase
      Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, paddedSize / 2), [=](const Ordinal pairIdx) {
        // Boxes halve in size with every phase of a level
        const Ordinal shift    = level - phase;
        const Ordinal boxSize  = Ordinal(2) << shift;
        // Start of the box owning this pair: (pairIdx * 2 / boxSize) * boxSize
        const Ordinal boxStart = (pairIdx >> shift) << (1 + shift);
        // Position of this pair inside the lower half of its box
        const Ordinal offsetInBox = pairIdx - (boxStart >> 1);
        const Ordinal lo          = boxStart + offsetInBox;
        // Phase 0 pairs each element with its mirror image in the box; later
        // phases pair elements a fixed half-box apart.
        const Ordinal hi = (phase == 0) ? (boxStart + boxSize - 1 - offsetInBox) : (lo + boxSize / 2);
        // Only touch pairs that are fully in bounds, and swap when out of order
        if (hi < n && comp(values[hi], values[lo])) {
          ValueType held = values[lo];
          values[lo]     = values[hi];
          values[hi]     = held;
        }
      });
      // Every phase must be finished team-wide before the next one reads
      mem.team_barrier();
    }
  }
}

// Sort "values", while applying the same swaps to "perm"
// Team-level bitonic sort of values[0..n) that mirrors every swap onto perm.
//   values : raw key array, sorted in place according to comp
//   perm   : companion array; perm[a] and perm[b] are exchanged whenever
//            values[a] and values[b] are
//   n      : number of elements in both arrays
//   mem    : team handle; all work is issued through TeamVectorRange(mem, ...)
//            and separated by mem.team_barrier()
//   comp   : comp(a, b) returning true triggers a swap when a sits above b
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember, typename Comparator>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem,
                                             const Comparator& comp) {
  // The compare/exchange network is laid out for the next power of two >= n.
  // Pairs whose upper index lands at or past n are simply skipped, so the
  // missing tail behaves like elements that never move.
  Ordinal paddedSize = 1;
  Ordinal numLevels  = 0;
  for (; paddedSize < n; paddedSize <<= 1) ++numLevels;

  for (Ordinal level = 0; level < numLevels; level++) {
    for (Ordinal phase = 0; phase <= level; phase++) {
      // paddedSize/2 pairs are compared in parallel in each phase
      Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, paddedSize / 2), [=](const Ordinal pairIdx) {
        // Boxes halve in size with every phase of a level
        const Ordinal shift    = level - phase;
        const Ordinal boxSize  = Ordinal(2) << shift;
        // Start of the box owning this pair: (pairIdx * 2 / boxSize) * boxSize
        const Ordinal boxStart = (pairIdx >> shift) << (1 + shift);
        // Position of this pair inside the lower half of its box
        const Ordinal offsetInBox = pairIdx - (boxStart >> 1);
        const Ordinal lo          = boxStart + offsetInBox;
        // Phase 0 pairs each element with its mirror image in the box; later
        // phases pair elements a fixed half-box apart.
        const Ordinal hi = (phase == 0) ? (boxStart + boxSize - 1 - offsetInBox) : (lo + boxSize / 2);
        // Only touch pairs that are fully in bounds, and swap when out of order
        if (hi < n && comp(values[hi], values[lo])) {
          ValueType heldValue = values[lo];
          values[lo]          = values[hi];
          values[hi]          = heldValue;
          // Keep the permutation entries attached to their values
          PermType heldPerm = perm[lo];
          perm[lo]          = perm[hi];
          perm[hi]          = heldPerm;
        }
      });
      // Every phase must be finished team-wide before the next one reads
      mem.team_barrier();
    }
  }
}

// For backward compatibility: keep the public interface accessible in
// KokkosKernels::Impl::
namespace Impl {

// Deprecated backward-compatibility wrapper: forwards to
// KokkosKernels::bitonicSort with the same template arguments.
//   v    : view to sort
//   comp : comparator, forwarded unchanged (default-constructed if omitted)
// The deprecation message names the replacement so that the compiler warning
// tells callers where to migrate.
template <typename View, typename ExecSpace, typename Ordinal,
          typename Comparator = Impl::DefaultComparator<typename View::value_type>>
[[deprecated("Use KokkosKernels::bitonicSort instead")]] void bitonicSort(View v,
                                                                          const Comparator& comp = Comparator()) {
  KokkosKernels::bitonicSort<View, ExecSpace, Ordinal, Comparator>(v, comp);
}

// Deprecated backward-compatibility wrapper: forwards to
// KokkosKernels::SerialRadixSort with the same template arguments.
//   values    : array to sort
//   valuesAux : auxiliary array, forwarded unchanged
//   n         : element count, forwarded unchanged
// The deprecation message names the replacement so that the compiler warning
// tells callers where to migrate.
template <typename Ordinal, typename ValueType>
[[deprecated("Use KokkosKernels::SerialRadixSort instead")]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(
    ValueType* values, ValueType* valuesAux, Ordinal n) {
  KokkosKernels::SerialRadixSort<Ordinal, ValueType>(values, valuesAux, n);
}

// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts
// values[0...n].
// Deprecated backward-compatibility wrapper: forwards to
// KokkosKernels::SerialRadixSort2 with the same template arguments.
//   values, valuesAux : value array and its auxiliary array
//   perm, permAux     : permutation array and its auxiliary array
//   n                 : element count
// All arguments are forwarded unchanged. The deprecation message names the
// replacement so that the compiler warning tells callers where to migrate.
template <typename Ordinal, typename ValueType, typename PermType>
[[deprecated("Use KokkosKernels::SerialRadixSort2 instead")]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(
    ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n) {
  KokkosKernels::SerialRadixSort2<Ordinal, ValueType, PermType>(values, valuesAux, perm, permAux, n);
}

template <typename Ordinal, typename ValueType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
const Comparator& comp = Comparator()) {
KokkosKernels::TeamBitonicSort<Ordinal, ValueType, TeamMember, Comparator>(values, n, mem, comp);
[[deprecated("Use Kokkos::Experimental::sort_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(
ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) {
Kokkos::View<ValueType*, Kokkos::AnonymousSpace> valuesView(values, n);
Kokkos::Experimental::sort_team(mem, valuesView, comp);
}

// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts
// values[0...n].
// Sort "values", while applying the same swaps to "perm"
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n,
const TeamMember mem,
const Comparator& comp = Comparator()) {
KokkosKernels::TeamBitonicSort2<Ordinal, ValueType, PermType, TeamMember, Comparator>(values, perm, n, mem, comp);
[[deprecated("Use Kokkos::Experimental::sort_by_key_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(
ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) {
Kokkos::View<ValueType*, Kokkos::AnonymousSpace> valuesView(values, n);
Kokkos::View<PermType*, Kokkos::AnonymousSpace> permView(perm, n);
Kokkos::Experimental::sort_by_key_team(mem, valuesView, permView, comp);
}
} // namespace Impl

} // namespace KokkosKernels

Expand Down
140 changes: 0 additions & 140 deletions common/unit_test/Test_Common_Sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,125 +248,6 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) {
}
}

// Team functor for the TeamBitonicSort test: the team with league rank r sorts
// the sub-array of `values` that starts at offsets(r) and holds counts(r)
// elements.
template <typename ValView, typename OrdView>
struct TestTeamBitonicFunctor {
  using Value = typename ValView::value_type;

  TestTeamBitonicFunctor(ValView& values_, OrdView& counts_, OrdView& offsets_)
      : values(values_), counts(counts_), offsets(offsets_) {}

  template <typename TeamMem>
  KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
    // One sub-array per team
    const int league     = t.league_rank();
    Value* const segment = values.data() + offsets(league);
    KokkosKernels::TeamBitonicSort<int, Value, TeamMem>(segment, counts(league), t);
  }

  ValView values;   // all sub-arrays, stored back to back
  OrdView counts;   // length of each sub-array
  OrdView offsets;  // start index of each sub-array within values
};

// Team functor for the TeamBitonicSort2 test: the team with league rank r
// sorts the keys in [offsets(r), offsets(r) + counts(r)) and applies the same
// swaps to the matching range of `values`.
template <typename KeyView, typename ValView, typename OrdView>
struct TestTeamBitonic2Functor {
  using Key   = typename KeyView::value_type;
  using Value = typename ValView::value_type;

  TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_)
      : keys(keys_), values(values_), counts(counts_), offsets(offsets_) {}

  template <typename TeamMem>
  KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
    // One sub-array per team; keys and values share the same offset
    const int league = t.league_rank();
    const auto start = offsets(league);
    KokkosKernels::TeamBitonicSort2<int, Key, Value, TeamMem>(keys.data() + start, values.data() + start,
                                                              counts(league), t);
  }

  KeyView keys;     // sort keys for all sub-arrays, stored back to back
  ValView values;   // payload permuted alongside the keys
  OrdView counts;   // length of each sub-array
  OrdView offsets;  // start index of each sub-array
};

// Checks TeamBitonicSort on k contiguous sub-arrays: each sub-array is sorted
// by its own team on the device and compared element-wise against std::sort
// applied to a host copy of the same data.
//   k            : number of sub-arrays (and teams)
//   subArraySize : size parameter passed to generateRandomOffsets
template <typename Device, typename Scalar>
void testTeamBitonicSort(size_t k, size_t subArraySize) {
  using exec_space = typename Device::execution_space;
  using mem_space  = typename Device::memory_space;
  using OrdView    = Kokkos::View<int*, mem_space>;
  using ValView    = Kokkos::View<Scalar*, mem_space>;

  // Sub-array layout: per-array counts and offsets; n is the total length
  OrdView counts("Subarray Sizes", k);
  OrdView offsets("Subarray Offsets", k);
  const size_t n = generateRandomOffsets<OrdView, exec_space>(counts, offsets, k, subArraySize);

  // Random input, plus an untouched host copy used as the reference
  ValView data("Bitonic sort testing data", n);
  fillRandom(data);
  Kokkos::View<Scalar*, Kokkos::HostSpace> gold("Host sorted", n);
  Kokkos::deep_copy(gold, data);

  // Device side: one team per sub-array, all sub-arrays in parallel
  Kokkos::parallel_for(Kokkos::TeamPolicy<exec_space>(k, Kokkos::AUTO()),
                       TestTeamBitonicFunctor<ValView, OrdView>(data, counts, offsets));

  // Bring the sorted result back to the host
  auto dataHost = Kokkos::create_mirror_view(data);
  Kokkos::deep_copy(dataHost, data);
  exec_space().fence();

  // Host side: sort each sub-array of the reference copy with std::sort
  auto countsHost  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts);
  auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets);
  for (size_t sub = 0; sub < k; ++sub) {
    Scalar* const first = gold.data() + offsetsHost(sub);
    Scalar* const last  = first + countsHost(sub);
    std::sort(first, last);
  }

  // Device and host results must agree everywhere
  for (size_t idx = 0; idx < n; ++idx) {
    ASSERT_EQ(dataHost(idx), gold(idx));
  }
}

// Checks TeamBitonicSort2 on k contiguous sub-arrays of key/value pairs: the
// keys must come out sorted exactly like std::sort on a host copy, and every
// value must still equal kvHash of the key it ends up next to.
//   k            : number of sub-arrays (and teams)
//   subArraySize : size parameter passed to generateRandomOffsets
template <typename Device, typename Key, typename Value>
void testTeamBitonicSort2(size_t k, size_t subArraySize) {
  using exec_space = typename Device::execution_space;
  using mem_space  = typename Device::memory_space;
  using OrdView    = Kokkos::View<int*, mem_space>;
  using KeyView    = Kokkos::View<Key*, mem_space>;
  using ValView    = Kokkos::View<Value*, mem_space>;

  // Sub-array layout: per-array counts and offsets; n is the total length
  OrdView counts("Subarray Sizes", k);
  OrdView offsets("Subarray Offsets", k);
  const size_t n = generateRandomOffsets<OrdView, exec_space>(counts, offsets, k, subArraySize);

  // Fill keys and their companion values, and keep a host copy of the keys as
  // the reference
  KeyView keys("Bitonic test keys", n);
  ValView data("Bitonic test data", n);
  fillRandom(keys, data);
  Kokkos::View<Key*, Kokkos::HostSpace> gold("Host sorted", n);
  Kokkos::deep_copy(gold, keys);

  // Device side: one team per sub-array, team size chosen by Kokkos::AUTO()
  Kokkos::parallel_for(Kokkos::TeamPolicy<exec_space>(k, Kokkos::AUTO()),
                       TestTeamBitonic2Functor<KeyView, ValView, OrdView>(keys, data, counts, offsets));
  exec_space().fence();

  // Host side: sort each sub-array of the reference keys with std::sort
  auto countsHost  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts);
  auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets);
  for (size_t sub = 0; sub < k; ++sub) {
    Key* const first = gold.data() + offsetsHost(sub);
    Key* const last  = first + countsHost(sub);
    std::sort(first, last);
  }

  // Fetch the device results
  auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys);
  auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data);

  // Keys must match the reference exactly (sort stability is irrelevant here)
  for (size_t idx = 0; idx < n; ++idx) {
    ASSERT_EQ(keysHost(idx), gold(idx));
  }
  // Each value must still be the hash of the key sitting at the same index
  for (size_t idx = 0; idx < n; ++idx) {
    auto expectedHash = kvHash<Key, Value>()(keysHost(idx));
    ASSERT_EQ(dataHost(idx), expectedHash);
  }
}

template <typename View>
struct CheckSortedFunctor {
CheckSortedFunctor(View& v_) : v(v_) {}
Expand Down Expand Up @@ -480,27 +361,6 @@ TEST_F(TestCategory, common_serial_radix2) {
}
}

TEST_F(TestCategory, common_team_bitonic) {
  // Team-level bitonic sort over contiguous sub-arrays.
  // testTeamBitonicSort takes (#arrays, max subarray size); the size bound
  // walks through 0, 1, 5, 21, ... while it stays below 10000.
  const size_t numArrays = 20;
  size_t arrayMax        = 0;
  while (arrayMax < 10000) {
    testTeamBitonicSort<TestDevice, char>(numArrays, arrayMax);
    testTeamBitonicSort<TestDevice, int>(numArrays, arrayMax);
    arrayMax = 1 + 4 * arrayMax;
  }
}

TEST_F(TestCategory, common_team_bitonic2) {
  // Team-level key/value bitonic sort over contiguous sub-arrays.
  // testTeamBitonicSort2 takes (#arrays, max subarray size); the size bound
  // walks through 0, 1, 5, 21, ... while it stays below 10000.
  const size_t numArrays = 20;
  size_t arrayMax        = 0;
  while (arrayMax < 10000) {
    testTeamBitonicSort2<TestDevice, char, int>(numArrays, arrayMax);
    testTeamBitonicSort2<TestDevice, int, double>(numArrays, arrayMax);
    testTeamBitonicSort2<TestDevice, int, Kokkos::complex<double>>(numArrays, arrayMax);
    arrayMax = 1 + 4 * arrayMax;
  }
}

TEST_F(TestCategory, common_device_bitonic) {
// Test device-level bitonic with some larger arrays
testBitonicSort<TestDevice, char>(243743);
Expand Down
Loading
Loading