diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 586ccd4e6fce6..50da37270e7d3 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -50,6 +50,31 @@ GetMultiPtrDecoratedAs(multi_ptr MPtr) { namespace spirv { +template +struct is_tangle_or_opportunistic_group : std::false_type {}; + +template +struct is_tangle_or_opportunistic_group< + sycl::ext::oneapi::experimental::tangle_group> + : std::true_type {}; + +template <> +struct is_tangle_or_opportunistic_group< + sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {}; + +template struct is_ballot_group : std::false_type {}; + +template +struct is_ballot_group< + sycl::ext::oneapi::experimental::ballot_group> + : std::true_type {}; + +template struct is_fixed_size_group : std::false_type {}; + +template +struct is_fixed_size_group> : std::true_type {}; + template struct group_scope {}; template @@ -260,6 +285,8 @@ template <> struct GroupId<::sycl::ext::oneapi::sub_group> { template <> struct GroupId<::sycl::sub_group> { using type = uint32_t; }; + +// Consolidated function for converting group arguments to OpenCL types. template EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { auto GroupLocalId = static_cast::type>(local_id); @@ -268,6 +295,7 @@ EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { auto OCLId = detail::convertToOpenCLType(GroupLocalId); return __spirv_GroupBroadcast(group_scope::value, WideOCLX, OCLId); } + template EnableIfNativeBroadcast GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, @@ -781,103 +809,222 @@ inline uint32_t membermask() { } #endif +template +inline uint32_t MapShuffleID(GroupT g, id<1> local_id) { + if constexpr (is_tangle_or_opportunistic_group::value || + is_ballot_group::value) + return detail::IdToMaskPosition(g, local_id); + else if constexpr (is_fixed_size_group::value) + return g.get_group_linear_id() * g.get_local_range().size() + local_id; + else + return local_id.get(0); +} + // Forward declarations for template overloadings -template -EnableIfBitcastShuffle SubgroupShuffle(T x, id<1> local_id); +template +EnableIfBitcastShuffle Shuffle(GroupT g, T x, id<1> local_id); -template -EnableIfBitcastShuffle SubgroupShuffleXor(T x, id<1> local_id); +template +EnableIfBitcastShuffle ShuffleXor(GroupT g, T x, id<1> local_id); -template -EnableIfBitcastShuffle SubgroupShuffleDown(T x, uint32_t delta); +template +EnableIfBitcastShuffle ShuffleDown(GroupT g, T x, uint32_t delta); -template -EnableIfBitcastShuffle SubgroupShuffleUp(T x, uint32_t delta); +template +EnableIfBitcastShuffle ShuffleUp(GroupT g, T x, uint32_t delta); -template -EnableIfGenericShuffle SubgroupShuffle(T x, id<1> local_id); +template +EnableIfGenericShuffle Shuffle(GroupT g, T x, id<1> local_id); -template -EnableIfGenericShuffle SubgroupShuffleXor(T x, id<1> local_id); +template +EnableIfGenericShuffle ShuffleXor(GroupT g, T x, id<1> local_id); -template -EnableIfGenericShuffle SubgroupShuffleDown(T x, uint32_t delta); +template +EnableIfGenericShuffle ShuffleDown(GroupT g, T x, uint32_t delta); -template -EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta); +template +EnableIfGenericShuffle ShuffleUp(GroupT g, T x, uint32_t delta); -template -EnableIfNativeShuffle SubgroupShuffle(T x, id<1> local_id) { +template +EnableIfNativeShuffle Shuffle(GroupT g, T x, id<1> local_id) { + uint32_t LocalId = MapShuffleID(g, local_id); #ifndef __NVPTX__ - return __spirv_SubgroupShuffleINTEL(convertToOpenCLType(x), - static_cast(local_id.get(0))); + std::ignore = g; + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT> && + detail::is_vec::value) { + // Temporary work-around due to a bug in IGC. + // TODO: Remove when IGC bug is fixed. + T result; + for (int s = 0; s < x.size(); ++s) + result[s] = Shuffle(g, x[s], local_id); + return result; + } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + return __spirv_GroupNonUniformShuffle(group_scope::value, + convertToOpenCLType(x), LocalId); + } else { + // Subgroup. + return __spirv_SubgroupShuffleINTEL(convertToOpenCLType(x), LocalId); + } #else - return __nvvm_shfl_sync_idx_i32(membermask(), x, local_id.get(0), 0x1f); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], + x, LocalId, 0x1f); + } else { + return __nvvm_shfl_sync_idx_i32(membermask(), x, LocalId, 0x1f); + } #endif } -template -EnableIfNativeShuffle SubgroupShuffleXor(T x, id<1> local_id) { +template +EnableIfNativeShuffle ShuffleXor(GroupT g, T x, id<1> mask) { #ifndef __NVPTX__ - return __spirv_SubgroupShuffleXorINTEL( - convertToOpenCLType(x), static_cast(local_id.get(0))); + std::ignore = g; + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT> && + detail::is_vec::value) { + // Temporary work-around due to a bug in IGC. + // TODO: Remove when IGC bug is fixed. + T result; + for (int s = 0; s < x.size(); ++s) + result[s] = ShuffleXor(g, x[s], mask); + return result; + } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + // Since the masks are relative to the groups, we could either try to adjust + // the mask or simply do the xor ourselves. Latter option is efficient, + // general, and simple so we go with that. + id<1> TargetLocalId = g.get_local_id() ^ mask; + uint32_t TargetId = MapShuffleID(g, TargetLocalId); + return __spirv_GroupNonUniformShuffle(group_scope::value, + convertToOpenCLType(x), TargetId); + } else { + // Subgroup. + return __spirv_SubgroupShuffleXorINTEL(convertToOpenCLType(x), + static_cast(mask.get(0))); + } #else - return __nvvm_shfl_sync_bfly_i32(membermask(), x, local_id.get(0), 0x1f); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + return __nvvm_shfl_sync_bfly_i32(detail::ExtractMask(detail::GetMask(g))[0], + x, static_cast(mask.get(0)), + 0x1f); + } else { + return __nvvm_shfl_sync_bfly_i32(membermask(), x, + static_cast(mask.get(0)), 0x1f); + } #endif } -template -EnableIfNativeShuffle SubgroupShuffleDown(T x, uint32_t delta) { +template +EnableIfNativeShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { #ifndef __NVPTX__ - return __spirv_SubgroupShuffleDownINTEL(convertToOpenCLType(x), - convertToOpenCLType(x), delta); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT> && + detail::is_vec::value) { + // Temporary work-around due to a bug in IGC. + // TODO: Remove when IGC bug is fixed. + T result; + for (int s = 0; s < x.size(); ++s) + result[s] = ShuffleDown(g, x[s], delta); + return result; + } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + id<1> TargetLocalId = g.get_local_id(); + // ID outside the group range is UB, so we just keep the current item ID + // unchanged. + if (TargetLocalId[0] + delta < g.get_local_linear_range()) + TargetLocalId[0] += delta; + uint32_t TargetId = MapShuffleID(g, TargetLocalId); + return __spirv_GroupNonUniformShuffle(group_scope::value, + convertToOpenCLType(x), TargetId); + } else { + // Subgroup. + return __spirv_SubgroupShuffleDownINTEL(convertToOpenCLType(x), + convertToOpenCLType(x), delta); + } #else - return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + return __nvvm_shfl_sync_down_i32(detail::ExtractMask(detail::GetMask(g))[0], + x, delta, 0x1f); + } else { + return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f); + } #endif } -template -EnableIfNativeShuffle SubgroupShuffleUp(T x, uint32_t delta) { +template +EnableIfNativeShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { #ifndef __NVPTX__ - return __spirv_SubgroupShuffleUpINTEL(convertToOpenCLType(x), - convertToOpenCLType(x), delta); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT> && + detail::is_vec::value) { + // Temporary work-around due to a bug in IGC. + // TODO: Remove when IGC bug is fixed. + T result; + for (int s = 0; s < x.size(); ++s) + result[s] = ShuffleUp(g, x[s], delta); + return result; + } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + id<1> TargetLocalId = g.get_local_id(); + // Underflow is UB, so we just keep the current item ID unchanged. + if (TargetLocalId[0] >= delta) + TargetLocalId[0] -= delta; + uint32_t TargetId = MapShuffleID(g, TargetLocalId); + return __spirv_GroupNonUniformShuffle(group_scope::value, + convertToOpenCLType(x), TargetId); + } else { + // Subgroup. + return __spirv_SubgroupShuffleUpINTEL(convertToOpenCLType(x), + convertToOpenCLType(x), delta); + } #else - return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< + GroupT>) { + return __nvvm_shfl_sync_up_i32(detail::ExtractMask(detail::GetMask(g))[0], + x, delta, 0); + } else { + return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0); + } #endif } -template -EnableIfVectorShuffle SubgroupShuffle(T x, id<1> local_id) { +template +EnableIfVectorShuffle Shuffle(GroupT g, T x, id<1> local_id) { T result; for (int s = 0; s < x.size(); ++s) { - result[s] = SubgroupShuffle(x[s], local_id); + result[s] = Shuffle(g, x[s], local_id); } return result; } -template -EnableIfVectorShuffle SubgroupShuffleXor(T x, id<1> local_id) { +template +EnableIfVectorShuffle ShuffleXor(GroupT g, T x, id<1> local_id) { T result; for (int s = 0; s < x.size(); ++s) { - result[s] = SubgroupShuffleXor(x[s], local_id); + result[s] = ShuffleXor(g, x[s], local_id); } return result; } -template -EnableIfVectorShuffle SubgroupShuffleDown(T x, uint32_t delta) { +template +EnableIfVectorShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { T result; for (int s = 0; s < x.size(); ++s) { - result[s] = SubgroupShuffleDown(x[s], delta); + result[s] = ShuffleDown(g, x[s], delta); } return result; } -template -EnableIfVectorShuffle SubgroupShuffleUp(T x, uint32_t delta) { +template +EnableIfVectorShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { T result; for (int s = 0; s < x.size(); ++s) { - result[s] = SubgroupShuffleUp(x[s], delta); + result[s] = ShuffleUp(g, x[s], delta); } return result; } @@ -885,113 +1032,92 @@ EnableIfVectorShuffle SubgroupShuffleUp(T x, uint32_t delta) { template using ConvertToNativeShuffleType_t = select_cl_scalar_integral_unsigned_t; -template -EnableIfBitcastShuffle SubgroupShuffle(T x, id<1> local_id) { +template +EnableIfBitcastShuffle Shuffle(GroupT g, T x, id<1> local_id) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = sycl::bit_cast(x); -#ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleINTEL( - ShuffleX, static_cast(local_id.get(0))); -#else - ShuffleT Result = - __nvvm_shfl_sync_idx_i32(membermask(), ShuffleX, local_id.get(0), 0x1f); -#endif + ShuffleT Result = Shuffle(g, ShuffleX, local_id); return sycl::bit_cast(Result); } -template -EnableIfBitcastShuffle SubgroupShuffleXor(T x, id<1> local_id) { +template +EnableIfBitcastShuffle ShuffleXor(GroupT g, T x, id<1> local_id) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = sycl::bit_cast(x); -#ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleXorINTEL( - ShuffleX, static_cast(local_id.get(0))); -#else - ShuffleT Result = - __nvvm_shfl_sync_bfly_i32(membermask(), ShuffleX, local_id.get(0), 0x1f); -#endif + ShuffleT Result = ShuffleXor(g, ShuffleX, local_id); return sycl::bit_cast(Result); } -template -EnableIfBitcastShuffle SubgroupShuffleDown(T x, uint32_t delta) { +template +EnableIfBitcastShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = sycl::bit_cast(x); -#ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(ShuffleX, ShuffleX, delta); -#else - ShuffleT Result = - __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, delta, 0x1f); -#endif + ShuffleT Result = ShuffleDown(g, ShuffleX, delta); return sycl::bit_cast(Result); } -template -EnableIfBitcastShuffle SubgroupShuffleUp(T x, uint32_t delta) { +template +EnableIfBitcastShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = sycl::bit_cast(x); -#ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleUpINTEL(ShuffleX, ShuffleX, delta); -#else - ShuffleT Result = __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, delta, 0); -#endif + ShuffleT Result = ShuffleUp(g, ShuffleX, delta); return sycl::bit_cast(Result); } -template -EnableIfGenericShuffle SubgroupShuffle(T x, id<1> local_id) { +template +EnableIfGenericShuffle Shuffle(GroupT g, T x, id<1> local_id) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; detail::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffle(ShuffleX, local_id); + ShuffleResult = Shuffle(g, ShuffleX, local_id); detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); return Result; } -template -EnableIfGenericShuffle SubgroupShuffleXor(T x, id<1> local_id) { +template +EnableIfGenericShuffle ShuffleXor(GroupT g, T x, id<1> local_id) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; detail::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id); + ShuffleResult = ShuffleXor(g, ShuffleX, local_id); detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); return Result; } -template -EnableIfGenericShuffle SubgroupShuffleDown(T x, uint32_t delta) { +template +EnableIfGenericShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; detail::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffleDown(ShuffleX, delta); + ShuffleResult = ShuffleDown(g, ShuffleX, delta); detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); return Result; } -template -EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta) { +template +EnableIfGenericShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; detail::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffleUp(ShuffleX, delta); + ShuffleResult = ShuffleUp(g, ShuffleX, delta); detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); @@ -1029,18 +1155,6 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { #endif } -template -struct is_tangle_or_opportunistic_group : std::false_type {}; - -template -struct is_tangle_or_opportunistic_group< - sycl::ext::oneapi::experimental::tangle_group> - : std::true_type {}; - -template <> -struct is_tangle_or_opportunistic_group< - sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {}; - // TODO: Refactor to avoid duplication after design settles #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ template <__spv::GroupOperation Op, typename Group, typename T> \ diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 488053441627c..0aa5a9e2c7501 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -522,14 +522,17 @@ joint_none_of(Group g, Ptr first, Ptr last, Predicate pred) { // TODO: remove check for detail::is_vec once sycl::vec is trivially // copyable. template -std::enable_if_t<(std::is_same_v, sub_group> && +std::enable_if_t<((std::is_same_v, sub_group> || + sycl::ext::oneapi::experimental::is_user_constructed_group_v< + std::decay_t>) && (std::is_trivially_copyable_v || detail::is_vec::value)), T> -shift_group_left(Group, T x, typename Group::linear_id_type delta = 1) { +shift_group_left(Group g, T x, typename Group::linear_id_type delta = 1) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffleDown(x, delta); + return sycl::detail::spirv::ShuffleDown(g, x, delta); #else + (void)g; (void)x; (void)delta; throw sycl::exception(make_error_code(errc::feature_not_supported), @@ -541,14 +544,17 @@ shift_group_left(Group, T x, typename Group::linear_id_type delta = 1) { // TODO: remove check for detail::is_vec once sycl::vec is trivially // copyable. template -std::enable_if_t<(std::is_same_v, sub_group> && +std::enable_if_t<((std::is_same_v, sub_group> || + sycl::ext::oneapi::experimental::is_user_constructed_group_v< + std::decay_t>) && (std::is_trivially_copyable_v || detail::is_vec::value)), T> -shift_group_right(Group, T x, typename Group::linear_id_type delta = 1) { +shift_group_right(Group g, T x, typename Group::linear_id_type delta = 1) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffleUp(x, delta); + return sycl::detail::spirv::ShuffleUp(g, x, delta); #else + (void)g; (void)x; (void)delta; throw sycl::exception(make_error_code(errc::feature_not_supported), @@ -560,14 +566,17 @@ shift_group_right(Group, T x, typename Group::linear_id_type delta = 1) { // TODO: remove check for detail::is_vec once sycl::vec is trivially // copyable. template -std::enable_if_t<(std::is_same_v, sub_group> && +std::enable_if_t<((std::is_same_v, sub_group> || + sycl::ext::oneapi::experimental::is_user_constructed_group_v< + std::decay_t>) && (std::is_trivially_copyable_v || detail::is_vec::value)), T> -permute_group_by_xor(Group, T x, typename Group::linear_id_type mask) { +permute_group_by_xor(Group g, T x, typename Group::linear_id_type mask) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffleXor(x, mask); + return sycl::detail::spirv::ShuffleXor(g, x, mask); #else + (void)g; (void)x; (void)mask; throw sycl::exception(make_error_code(errc::feature_not_supported), @@ -579,14 +588,17 @@ permute_group_by_xor(Group, T x, typename Group::linear_id_type mask) { // TODO: remove check for detail::is_vec once sycl::vec is trivially // copyable. template -std::enable_if_t<(std::is_same_v, sub_group> && +std::enable_if_t<((std::is_same_v, sub_group> || + sycl::ext::oneapi::experimental::is_user_constructed_group_v< + std::decay_t>) && (std::is_trivially_copyable_v || detail::is_vec::value)), T> -select_from_group(Group, T x, typename Group::id_type local_id) { +select_from_group(Group g, T x, typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffle(x, local_id); + return sycl::detail::spirv::Shuffle(g, x, local_id); #else + (void)g; (void)x; (void)local_id; throw sycl::exception(make_error_code(errc::feature_not_supported), diff --git a/sycl/include/sycl/sub_group.hpp b/sycl/include/sycl/sub_group.hpp index 9781195d10c4c..057e22ce7113c 100644 --- a/sycl/include/sycl/sub_group.hpp +++ b/sycl/include/sycl/sub_group.hpp @@ -223,7 +223,7 @@ struct sub_group { __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.") T shuffle(T x, id_type local_id) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffle(x, local_id); + return sycl::detail::spirv::Shuffle(*this, x, local_id); #else (void)x; (void)local_id; @@ -236,7 +236,7 @@ struct sub_group { __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.") T shuffle_down(T x, uint32_t delta) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffleDown(x, delta); + return sycl::detail::spirv::ShuffleDown(*this, x, delta); #else (void)x; (void)delta; @@ -249,7 +249,7 @@ struct sub_group { __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.") T shuffle_up(T x, uint32_t delta) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffleUp(x, delta); + return sycl::detail::spirv::ShuffleUp(*this, x, delta); #else (void)x; (void)delta; @@ -262,7 +262,7 @@ struct sub_group { __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.") T shuffle_xor(T x, id_type value) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::SubgroupShuffleXor(x, value); + return sycl::detail::spirv::ShuffleXor(*this, x, value); #else (void)x; (void)value; diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index 03cb9e5ba6a7f..a06353383b80f 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -2,7 +2,8 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: hip +// REQUIRES: sg-32 +// REQUIRES: aspect-ext_oneapi_ballot_group #include #include @@ -13,13 +14,6 @@ class TestKernel; int main() { sycl::queue Q; - auto SGSizes = Q.get_device().get_info(); - if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { - std::cout << "Test skipped due to missing support for sub-group size 32." - << std::endl; - return 0; - } - sycl::buffer TmpBuf{sycl::range{32}}; sycl::buffer BarrierBuf{sycl::range{32}}; sycl::buffer BroadcastBuf{sycl::range{32}}; @@ -29,6 +23,10 @@ int main() { sycl::buffer ReduceBuf{sycl::range{32}}; sycl::buffer ExScanBuf{sycl::range{32}}; sycl::buffer IncScanBuf{sycl::range{32}}; + sycl::buffer ShiftLeftBuf{sycl::range{32}}; + sycl::buffer ShiftRightBuf{sycl::range{32}}; + sycl::buffer SelectBuf{sycl::range{32}}; + sycl::buffer PermuteXorBuf{sycl::range{32}}; const auto NDR = sycl::nd_range<1>{32, 32}; Q.submit([&](sycl::handler &CGH) { @@ -41,6 +39,10 @@ int main() { sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only}; + sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only}; + sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only}; + sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only}; const auto KernelFunc = [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { auto WI = item.get_global_id(); @@ -49,6 +51,7 @@ int main() { // Split into odd and even work-items. bool Predicate = WI % 2 == 0; auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + uint32_t BallotGroupSize = BallotGroup.get_local_linear_range(); // Check all other members' writes are visible after a barrier. TmpAcc[WI] = 1; @@ -92,8 +95,7 @@ int main() { uint32_t ReduceResult = sycl::reduce_over_group(BallotGroup, 1, sycl::plus<>()); - ReduceAcc[WI] = - (ReduceResult == BallotGroup.get_local_linear_range()); + ReduceAcc[WI] = (ReduceResult == BallotGroupSize); uint32_t ExScanResult = sycl::exclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); @@ -102,6 +104,24 @@ int main() { uint32_t IncScanResult = sycl::inclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); IncScanAcc[WI] = (IncScanResult == LID + 1); + + uint32_t ShiftLeftResult = + sycl::shift_group_left(BallotGroup, LID, 2); + ShiftLeftAcc[WI] = + (LID + 2 >= BallotGroupSize || ShiftLeftResult == LID + 2); + + uint32_t ShiftRightResult = + sycl::shift_group_right(BallotGroup, LID, 2); + ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2); + + uint32_t SelectResult = sycl::select_from_group( + BallotGroup, LID, + (BallotGroup.get_local_id() + 2) % BallotGroupSize); + SelectAcc[WI] = (SelectResult == (LID + 2) % BallotGroupSize); + + uint32_t PermuteXorResult = + sycl::permute_group_by_xor(BallotGroup, LID, 2); + PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ 2)); }; CGH.parallel_for(NDR, KernelFunc); }); @@ -114,6 +134,10 @@ int main() { sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only}; + sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only}; + sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only}; + sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only}; for (int WI = 0; WI < 32; ++WI) { assert(BarrierAcc[WI] == true); assert(BroadcastAcc[WI] == true); @@ -123,6 +147,14 @@ int main() { assert(ReduceAcc[WI] == true); assert(ExScanAcc[WI] == true); assert(IncScanAcc[WI] == true); + // TODO: Enable for CUDA devices when issue with shuffles have been + // addressed. + if (Q.get_backend() != sycl::backend::ext_oneapi_cuda) { + assert(ShiftLeftAcc[WI] == true); + assert(ShiftRightAcc[WI] == true); + assert(SelectAcc[WI] == true); + assert(PermuteXorAcc[WI] == true); + } } return 0; } diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp index 74ec9d24f212a..b83ee45f054da 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -3,7 +3,7 @@ // // REQUIRES: gpu // REQUIRES: sg-32 -// UNSUPPORTED: hip +// REQUIRES: aspect-ext_oneapi_fixed_size_group #include #include @@ -25,6 +25,10 @@ template void test() { sycl::buffer ReduceBuf{sycl::range{SGSize}}; sycl::buffer ExScanBuf{sycl::range{SGSize}}; sycl::buffer IncScanBuf{sycl::range{SGSize}}; + sycl::buffer ShiftLeftBuf{sycl::range{SGSize}}; + sycl::buffer ShiftRightBuf{sycl::range{SGSize}}; + sycl::buffer SelectBuf{sycl::range{SGSize}}; + sycl::buffer PermuteXorBuf{sycl::range{SGSize}}; const auto NDR = sycl::nd_range<1>{SGSize, SGSize}; Q.submit([&](sycl::handler &CGH) { @@ -37,6 +41,10 @@ template void test() { sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only}; + sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only}; + sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only}; + sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only}; const auto KernelFunc = [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] { auto WI = item.get_global_id(); @@ -95,6 +103,23 @@ template void test() { uint32_t IncScanResult = sycl::inclusive_scan_over_group(Partition, 1, sycl::plus<>()); IncScanAcc[WI] = (IncScanResult == LID + 1); + + uint32_t ShiftLeftResult = sycl::shift_group_left(Partition, LID, 2); + ShiftLeftAcc[WI] = + (LID + 2 >= PartitionSize || ShiftLeftResult == LID + 2); + + uint32_t ShiftRightResult = + sycl::shift_group_right(Partition, LID, 2); + ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2); + + uint32_t SelectResult = sycl::select_from_group( + Partition, LID, (Partition.get_local_id() + 2) % PartitionSize); + SelectAcc[WI] = (SelectResult == (LID + 2) % PartitionSize); + + uint32_t Mask = PartitionSize <= 2 ? 0 : 2; + uint32_t PermuteXorResult = + sycl::permute_group_by_xor(Partition, LID, Mask); + PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ Mask)); }; CGH.parallel_for>(NDR, KernelFunc); }); @@ -107,6 +132,10 @@ template void test() { sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only}; + sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only}; + sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only}; + sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only}; for (int WI = 0; WI < SGSize; ++WI) { assert(BarrierAcc[WI] == true); assert(BroadcastAcc[WI] == true); @@ -116,6 +145,10 @@ template void test() { assert(ReduceAcc[WI] == true); assert(ExScanAcc[WI] == true); assert(IncScanAcc[WI] == true); + assert(ShiftLeftAcc[WI] == true); + assert(ShiftRightAcc[WI] == true); + assert(SelectAcc[WI] == true); + assert(PermuteXorAcc[WI] == true); } } diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp index 93636a8156167..4279b58d5c890 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp @@ -2,7 +2,8 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: hip +// REQUIRES: sg-32 +// REQUIRES: aspect-ext_oneapi_opportunistic_group #include #include @@ -16,13 +17,6 @@ constexpr uint32_t ArbitraryItem = 5; int main() { sycl::queue Q; - auto SGSizes = Q.get_device().get_info(); - if (std::find(SGSizes.begin(), SGSizes.end(), SGSize) == SGSizes.end()) { - std::cout << "Test skipped due to missing support for sub-group size 32." - << std::endl; - return 0; - } - sycl::buffer TmpBuf{sycl::range{SGSize}}; sycl::buffer BarrierBuf{sycl::range{SGSize}}; sycl::buffer BroadcastBuf{sycl::range{SGSize}}; @@ -32,6 +26,10 @@ int main() { sycl::buffer ReduceBuf{sycl::range{SGSize}}; sycl::buffer ExScanBuf{sycl::range{SGSize}}; sycl::buffer IncScanBuf{sycl::range{SGSize}}; + sycl::buffer ShiftLeftBuf{sycl::range{SGSize}}; + sycl::buffer ShiftRightBuf{sycl::range{SGSize}}; + sycl::buffer SelectBuf{sycl::range{SGSize}}; + sycl::buffer PermuteXorBuf{sycl::range{SGSize}}; const auto NDR = sycl::nd_range<1>{SGSize, SGSize}; Q.submit([&](sycl::handler &CGH) { @@ -44,6 +42,10 @@ int main() { sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only}; + sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only}; + sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only}; + sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only}; const auto KernelFunc = [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] { auto WI = item.get_global_id(); @@ -68,6 +70,8 @@ int main() { // Simple check of group algorithms. uint32_t LID = OpportunisticGroup.get_local_linear_id(); + uint32_t OpportunisticGroupSize = + OpportunisticGroup.get_local_linear_range(); uint32_t BroadcastResult = sycl::group_broadcast(OpportunisticGroup, OriginalLID, 0); @@ -95,6 +99,26 @@ int main() { uint32_t IncScanResult = sycl::inclusive_scan_over_group( OpportunisticGroup, 1, sycl::plus<>()); IncScanAcc[WI] = (IncScanResult == LID + 1); + + uint32_t ShiftLeftResult = + sycl::shift_group_left(OpportunisticGroup, LID, 2); + ShiftLeftAcc[WI] = (LID + 2 >= OpportunisticGroupSize || + ShiftLeftResult == LID + 2); + + uint32_t ShiftRightResult = + sycl::shift_group_right(OpportunisticGroup, LID, 2); + ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2); + + uint32_t SelectResult = sycl::select_from_group( + OpportunisticGroup, LID, + (OpportunisticGroup.get_local_id() + 2) % + OpportunisticGroupSize); + SelectAcc[WI] = + (SelectResult == (LID + 2) % OpportunisticGroupSize); + + uint32_t PermuteXorResult = + sycl::permute_group_by_xor(OpportunisticGroup, LID, 0); + PermuteXorAcc[WI] = (PermuteXorResult == LID); } else { BarrierAcc[WI] = false; BroadcastAcc[WI] = false; @@ -104,6 +128,10 @@ int main() { ReduceAcc[WI] = false; ExScanAcc[WI] = false; IncScanAcc[WI] = false; + ShiftLeftAcc[WI] = false; + ShiftRightAcc[WI] = false; + SelectAcc[WI] = false; + PermuteXorAcc[WI] = false; } }; CGH.parallel_for(NDR, KernelFunc); @@ -117,6 +145,10 @@ int main() { sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only}; + sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only}; + sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only}; + sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only}; for (uint32_t WI = 0; WI < 32; ++WI) { bool ExpectedResult = (WI == ArbitraryItem); assert(BarrierAcc[WI] == ExpectedResult); @@ -127,6 +159,10 @@ int main() { assert(ReduceAcc[WI] == ExpectedResult); assert(ExScanAcc[WI] == ExpectedResult); assert(IncScanAcc[WI] == ExpectedResult); + assert(ShiftLeftAcc[WI] == ExpectedResult); + assert(ShiftRightAcc[WI] == ExpectedResult); + assert(SelectAcc[WI] == ExpectedResult); + assert(PermuteXorAcc[WI] == ExpectedResult); } return 0; } diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp index 2e50b876590db..c98cf1901947f 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp @@ -3,7 +3,8 @@ // // REQUIRES: gpu // REQUIRES: sg-32 -// UNSUPPORTED: cuda || hip || windows +// REQUIRES: aspect-ext_oneapi_tangle_group +// UNSUPPORTED: cuda || windows // Tangle groups exhibit unpredictable behavior on Windows. // The test is disabled while we investigate the root cause. @@ -27,6 +28,10 @@ int main() { sycl::buffer ReduceBuf{sycl::range{SGSize}}; sycl::buffer ExScanBuf{sycl::range{SGSize}}; sycl::buffer IncScanBuf{sycl::range{SGSize}}; + sycl::buffer ShiftLeftBuf{sycl::range{SGSize}}; + sycl::buffer ShiftRightBuf{sycl::range{SGSize}}; + sycl::buffer SelectBuf{sycl::range{SGSize}}; + sycl::buffer PermuteXorBuf{sycl::range{SGSize}}; const auto NDR = sycl::nd_range<1>{SGSize, SGSize}; Q.submit([&](sycl::handler &CGH) { @@ -39,6 +44,10 @@ int main() { sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only}; + sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only}; + sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only}; + sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only}; const auto KernelFunc = [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] { auto WI = item.get_global_id(); @@ -85,6 +94,21 @@ int main() { uint32_t IncScanResult = sycl::inclusive_scan_over_group(Tangle, 1, sycl::plus<>()); IncScanAcc[WI] = (IncScanResult == LID + 1); + + uint32_t ShiftLeftResult = sycl::shift_group_left(Tangle, LID, 2); + ShiftLeftAcc[WI] = + (LID + 2 >= TangleSize || ShiftLeftResult == LID + 2); + + uint32_t ShiftRightResult = sycl::shift_group_right(Tangle, LID, 2); + ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2); + + uint32_t SelectResult = sycl::select_from_group( + Tangle, LID, (Tangle.get_local_id() + 2) % TangleSize); + SelectAcc[WI] = (SelectResult == (LID + 2) % TangleSize); + + uint32_t PermuteXorResult = + sycl::permute_group_by_xor(Tangle, LID, 2); + PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ 2)); }; // Split into three groups of different sizes, using control flow @@ -125,6 +149,10 @@ int main() { sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only}; + sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only}; + sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only}; + sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only}; for (int WI = 0; WI < SGSize; ++WI) { assert(BarrierAcc[WI] == true); assert(BroadcastAcc[WI] == true); @@ -134,6 +162,10 @@ int main() { assert(ReduceAcc[WI] == true); assert(ExScanAcc[WI] == true); assert(IncScanAcc[WI] == true); + assert(ShiftLeftAcc[WI] == true); + assert(ShiftRightAcc[WI] == true); + assert(SelectAcc[WI] == true); + assert(PermuteXorAcc[WI] == true); } return 0; }