diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 586ccd4e6fce6..50da37270e7d3 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -50,6 +50,31 @@ GetMultiPtrDecoratedAs(multi_ptr<FromT, Space, IsDecorated> MPtr) {
 
 namespace spirv {
 
+template <typename Group>
+struct is_tangle_or_opportunistic_group : std::false_type {};
+
+template <typename ParentGroup>
+struct is_tangle_or_opportunistic_group<
+    sycl::ext::oneapi::experimental::tangle_group<ParentGroup>>
+    : std::true_type {};
+
+template <>
+struct is_tangle_or_opportunistic_group<
+    sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {};
+
+template <typename Group> struct is_ballot_group : std::false_type {};
+
+template <typename ParentGroup>
+struct is_ballot_group<
+    sycl::ext::oneapi::experimental::ballot_group<ParentGroup>>
+    : std::true_type {};
+
+template <typename Group> struct is_fixed_size_group : std::false_type {};
+
+template <size_t PartitionSize, typename ParentGroup>
+struct is_fixed_size_group<sycl::ext::oneapi::experimental::fixed_size_group<
+    PartitionSize, ParentGroup>> : std::true_type {};
+
 template <typename Group> struct group_scope {};
 
 template <int Dimensions>
@@ -260,6 +285,8 @@ template <> struct GroupId<::sycl::ext::oneapi::sub_group> {
 template <> struct GroupId<::sycl::sub_group> {
   using type = uint32_t;
 };
+
+// Consolidated function for converting group arguments to OpenCL types.
 template <typename Group, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
   auto GroupLocalId = static_cast<typename GroupId<Group>::type>(local_id);
@@ -268,6 +295,7 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
   auto OCLId = detail::convertToOpenCLType(GroupLocalId);
   return __spirv_GroupBroadcast(group_scope<Group>::value, WideOCLX, OCLId);
 }
+
 template <typename ParentGroup, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT>
 GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
@@ -781,103 +809,222 @@ inline uint32_t membermask() {
 }
 #endif
 
+template <typename GroupT>
+inline uint32_t MapShuffleID(GroupT g, id<1> local_id) {
+  if constexpr (is_tangle_or_opportunistic_group<GroupT>::value ||
+                is_ballot_group<GroupT>::value)
+    return detail::IdToMaskPosition(g, local_id);
+  else if constexpr (is_fixed_size_group<GroupT>::value)
+    return g.get_group_linear_id() * g.get_local_range().size() + local_id;
+  else
+    return local_id.get(0);
+}
+
 // Forward declarations for template overloadings
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffle(T x, id<1> local_id);
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> Shuffle(GroupT g, T x, id<1> local_id);
 
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffleXor(T x, id<1> local_id);
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id);
 
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffleDown(T x, uint32_t delta);
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta);
 
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffleUp(T x, uint32_t delta);
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta);
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id);
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> Shuffle(GroupT g, T x, id<1> local_id);
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id);
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id);
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, uint32_t delta);
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta);
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, uint32_t delta);
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta);
 
-template <typename T>
-EnableIfNativeShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfNativeShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
+  uint32_t LocalId = MapShuffleID(g, local_id);
 #ifndef __NVPTX__
-  return __spirv_SubgroupShuffleINTEL(convertToOpenCLType(x),
-                                      static_cast<uint32_t>(local_id.get(0)));
+  std::ignore = g;
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT> &&
+                detail::is_vec<T>::value) {
+    // Temporary work-around due to a bug in IGC.
+    // TODO: Remove when IGC bug is fixed.
+    T result;
+    for (int s = 0; s < x.size(); ++s)
+      result[s] = Shuffle(g, x[s], local_id);
+    return result;
+  } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                           GroupT>) {
+    return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
+                                          convertToOpenCLType(x), LocalId);
+  } else {
+    // Subgroup.
+    return __spirv_SubgroupShuffleINTEL(convertToOpenCLType(x), LocalId);
+  }
 #else
-  return __nvvm_shfl_sync_idx_i32(membermask(), x, local_id.get(0), 0x1f);
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT>) {
+    return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0],
+                                    x, LocalId, 0x1f);
+  } else {
+    return __nvvm_shfl_sync_idx_i32(membermask(), x, LocalId, 0x1f);
+  }
 #endif
 }
 
-template <typename T>
-EnableIfNativeShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfNativeShuffle<T> ShuffleXor(GroupT g, T x, id<1> mask) {
 #ifndef __NVPTX__
-  return __spirv_SubgroupShuffleXorINTEL(
-      convertToOpenCLType(x), static_cast<uint32_t>(local_id.get(0)));
+  std::ignore = g;
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT> &&
+                detail::is_vec<T>::value) {
+    // Temporary work-around due to a bug in IGC.
+    // TODO: Remove when IGC bug is fixed.
+    T result;
+    for (int s = 0; s < x.size(); ++s)
+      result[s] = ShuffleXor(g, x[s], mask);
+    return result;
+  } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                           GroupT>) {
+    // Since the masks are relative to the groups, we could either try to adjust
+    // the mask or simply do the xor ourselves. Latter option is efficient,
+    // general, and simple so we go with that.
+    id<1> TargetLocalId = g.get_local_id() ^ mask;
+    uint32_t TargetId = MapShuffleID(g, TargetLocalId);
+    return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
+                                          convertToOpenCLType(x), TargetId);
+  } else {
+    // Subgroup.
+    return __spirv_SubgroupShuffleXorINTEL(convertToOpenCLType(x),
+                                           static_cast<uint32_t>(mask.get(0)));
+  }
 #else
-  return __nvvm_shfl_sync_bfly_i32(membermask(), x, local_id.get(0), 0x1f);
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT>) {
+    return __nvvm_shfl_sync_bfly_i32(detail::ExtractMask(detail::GetMask(g))[0],
+                                     x, static_cast<uint32_t>(mask.get(0)),
+                                     0x1f);
+  } else {
+    return __nvvm_shfl_sync_bfly_i32(membermask(), x,
+                                     static_cast<uint32_t>(mask.get(0)), 0x1f);
+  }
 #endif
 }
 
-template <typename T>
-EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfNativeShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
 #ifndef __NVPTX__
-  return __spirv_SubgroupShuffleDownINTEL(convertToOpenCLType(x),
-                                          convertToOpenCLType(x), delta);
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT> &&
+                detail::is_vec<T>::value) {
+    // Temporary work-around due to a bug in IGC.
+    // TODO: Remove when IGC bug is fixed.
+    T result;
+    for (int s = 0; s < x.size(); ++s)
+      result[s] = ShuffleDown(g, x[s], delta);
+    return result;
+  } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                           GroupT>) {
+    id<1> TargetLocalId = g.get_local_id();
+    // ID outside the group range is UB, so we just keep the current item ID
+    // unchanged.
+    if (TargetLocalId[0] + delta < g.get_local_linear_range())
+      TargetLocalId[0] += delta;
+    uint32_t TargetId = MapShuffleID(g, TargetLocalId);
+    return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
+                                          convertToOpenCLType(x), TargetId);
+  } else {
+    // Subgroup.
+    return __spirv_SubgroupShuffleDownINTEL(convertToOpenCLType(x),
+                                            convertToOpenCLType(x), delta);
+  }
 #else
-  return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f);
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT>) {
+    return __nvvm_shfl_sync_down_i32(detail::ExtractMask(detail::GetMask(g))[0],
+                                     x, delta, 0x1f);
+  } else {
+    return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f);
+  }
 #endif
 }
 
-template <typename T>
-EnableIfNativeShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfNativeShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
 #ifndef __NVPTX__
-  return __spirv_SubgroupShuffleUpINTEL(convertToOpenCLType(x),
-                                        convertToOpenCLType(x), delta);
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT> &&
+                detail::is_vec<T>::value) {
+    // Temporary work-around due to a bug in IGC.
+    // TODO: Remove when IGC bug is fixed.
+    T result;
+    for (int s = 0; s < x.size(); ++s)
+      result[s] = ShuffleUp(g, x[s], delta);
+    return result;
+  } else if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                           GroupT>) {
+    id<1> TargetLocalId = g.get_local_id();
+    // Underflow is UB, so we just keep the current item ID unchanged.
+    if (TargetLocalId[0] >= delta)
+      TargetLocalId[0] -= delta;
+    uint32_t TargetId = MapShuffleID(g, TargetLocalId);
+    return __spirv_GroupNonUniformShuffle(group_scope<GroupT>::value,
+                                          convertToOpenCLType(x), TargetId);
+  } else {
+    // Subgroup.
+    return __spirv_SubgroupShuffleUpINTEL(convertToOpenCLType(x),
+                                          convertToOpenCLType(x), delta);
+  }
 #else
-  return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0);
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<
+                    GroupT>) {
+    return __nvvm_shfl_sync_up_i32(detail::ExtractMask(detail::GetMask(g))[0],
+                                   x, delta, 0);
+  } else {
+    return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0);
+  }
 #endif
 }
 
-template <typename T>
-EnableIfVectorShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfVectorShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
   T result;
   for (int s = 0; s < x.size(); ++s) {
-    result[s] = SubgroupShuffle(x[s], local_id);
+    result[s] = Shuffle(g, x[s], local_id);
   }
   return result;
 }
 
-template <typename T>
-EnableIfVectorShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfVectorShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
   T result;
   for (int s = 0; s < x.size(); ++s) {
-    result[s] = SubgroupShuffleXor(x[s], local_id);
+    result[s] = ShuffleXor(g, x[s], local_id);
   }
   return result;
 }
 
-template <typename T>
-EnableIfVectorShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfVectorShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
   T result;
   for (int s = 0; s < x.size(); ++s) {
-    result[s] = SubgroupShuffleDown(x[s], delta);
+    result[s] = ShuffleDown(g, x[s], delta);
   }
   return result;
 }
 
-template <typename T>
-EnableIfVectorShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfVectorShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
   T result;
   for (int s = 0; s < x.size(); ++s) {
-    result[s] = SubgroupShuffleUp(x[s], delta);
+    result[s] = ShuffleUp(g, x[s], delta);
   }
   return result;
 }
@@ -885,113 +1032,92 @@ EnableIfVectorShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
 template <typename T>
 using ConvertToNativeShuffleType_t = select_cl_scalar_integral_unsigned_t<T>;
 
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
-#ifndef __NVPTX__
-  ShuffleT Result = __spirv_SubgroupShuffleINTEL(
-      ShuffleX, static_cast<uint32_t>(local_id.get(0)));
-#else
-  ShuffleT Result =
-      __nvvm_shfl_sync_idx_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
-#endif
+  ShuffleT Result = Shuffle(g, ShuffleX, local_id);
   return sycl::bit_cast<T>(Result);
 }
 
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
-#ifndef __NVPTX__
-  ShuffleT Result = __spirv_SubgroupShuffleXorINTEL(
-      ShuffleX, static_cast<uint32_t>(local_id.get(0)));
-#else
-  ShuffleT Result =
-      __nvvm_shfl_sync_bfly_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
-#endif
+  ShuffleT Result = ShuffleXor(g, ShuffleX, local_id);
   return sycl::bit_cast<T>(Result);
 }
 
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
-#ifndef __NVPTX__
-  ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(ShuffleX, ShuffleX, delta);
-#else
-  ShuffleT Result =
-      __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, delta, 0x1f);
-#endif
+  ShuffleT Result = ShuffleDown(g, ShuffleX, delta);
   return sycl::bit_cast<T>(Result);
 }
 
-template <typename T>
-EnableIfBitcastShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfBitcastShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = sycl::bit_cast<ShuffleT>(x);
-#ifndef __NVPTX__
-  ShuffleT Result = __spirv_SubgroupShuffleUpINTEL(ShuffleX, ShuffleX, delta);
-#else
-  ShuffleT Result = __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, delta, 0);
-#endif
+  ShuffleT Result = ShuffleUp(g, ShuffleX, delta);
   return sycl::bit_cast<T>(Result);
 }
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> Shuffle(GroupT g, T x, id<1> local_id) {
   T Result;
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
     ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
-    ShuffleResult = SubgroupShuffle(ShuffleX, local_id);
+    ShuffleResult = Shuffle(g, ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
   };
   GenericCall<T>(ShuffleBytes);
   return Result;
 }
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> ShuffleXor(GroupT g, T x, id<1> local_id) {
   T Result;
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
     ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
-    ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id);
+    ShuffleResult = ShuffleXor(g, ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
   };
   GenericCall<T>(ShuffleBytes);
   return Result;
 }
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> ShuffleDown(GroupT g, T x, uint32_t delta) {
   T Result;
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
     ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
-    ShuffleResult = SubgroupShuffleDown(ShuffleX, delta);
+    ShuffleResult = ShuffleDown(g, ShuffleX, delta);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
   };
   GenericCall<T>(ShuffleBytes);
   return Result;
 }
 
-template <typename T>
-EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
+template <typename GroupT, typename T>
+EnableIfGenericShuffle<T> ShuffleUp(GroupT g, T x, uint32_t delta) {
   T Result;
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
     ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
-    ShuffleResult = SubgroupShuffleUp(ShuffleX, delta);
+    ShuffleResult = ShuffleUp(g, ShuffleX, delta);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
   };
   GenericCall<T>(ShuffleBytes);
@@ -1029,18 +1155,6 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
 #endif
 }
 
-template <typename Group>
-struct is_tangle_or_opportunistic_group : std::false_type {};
-
-template <typename ParentGroup>
-struct is_tangle_or_opportunistic_group<
-    sycl::ext::oneapi::experimental::tangle_group<ParentGroup>>
-    : std::true_type {};
-
-template <>
-struct is_tangle_or_opportunistic_group<
-    sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {};
-
 // TODO: Refactor to avoid duplication after design settles
 #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction)                          \
   template <__spv::GroupOperation Op, typename Group, typename T>              \
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 488053441627c..0aa5a9e2c7501 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -522,14 +522,17 @@ joint_none_of(Group g, Ptr first, Ptr last, Predicate pred) {
 // TODO: remove check for detail::is_vec<T> once sycl::vec is trivially
 // copyable.
 template <typename Group, typename T>
-std::enable_if_t<(std::is_same_v<std::decay_t<Group>, sub_group> &&
+std::enable_if_t<((std::is_same_v<std::decay_t<Group>, sub_group> ||
+                   sycl::ext::oneapi::experimental::is_user_constructed_group_v<
+                       std::decay_t<Group>>) &&
                   (std::is_trivially_copyable_v<T> ||
                    detail::is_vec<T>::value)),
                  T>
-shift_group_left(Group, T x, typename Group::linear_id_type delta = 1) {
+shift_group_left(Group g, T x, typename Group::linear_id_type delta = 1) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::SubgroupShuffleDown(x, delta);
+  return sycl::detail::spirv::ShuffleDown(g, x, delta);
 #else
+  (void)g;
   (void)x;
   (void)delta;
   throw sycl::exception(make_error_code(errc::feature_not_supported),
@@ -541,14 +544,17 @@ shift_group_left(Group, T x, typename Group::linear_id_type delta = 1) {
 // TODO: remove check for detail::is_vec<T> once sycl::vec is trivially
 // copyable.
 template <typename Group, typename T>
-std::enable_if_t<(std::is_same_v<std::decay_t<Group>, sub_group> &&
+std::enable_if_t<((std::is_same_v<std::decay_t<Group>, sub_group> ||
+                   sycl::ext::oneapi::experimental::is_user_constructed_group_v<
+                       std::decay_t<Group>>) &&
                   (std::is_trivially_copyable_v<T> ||
                    detail::is_vec<T>::value)),
                  T>
-shift_group_right(Group, T x, typename Group::linear_id_type delta = 1) {
+shift_group_right(Group g, T x, typename Group::linear_id_type delta = 1) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::SubgroupShuffleUp(x, delta);
+  return sycl::detail::spirv::ShuffleUp(g, x, delta);
 #else
+  (void)g;
   (void)x;
   (void)delta;
   throw sycl::exception(make_error_code(errc::feature_not_supported),
@@ -560,14 +566,17 @@ shift_group_right(Group, T x, typename Group::linear_id_type delta = 1) {
 // TODO: remove check for detail::is_vec<T> once sycl::vec is trivially
 // copyable.
 template <typename Group, typename T>
-std::enable_if_t<(std::is_same_v<std::decay_t<Group>, sub_group> &&
+std::enable_if_t<((std::is_same_v<std::decay_t<Group>, sub_group> ||
+                   sycl::ext::oneapi::experimental::is_user_constructed_group_v<
+                       std::decay_t<Group>>) &&
                   (std::is_trivially_copyable_v<T> ||
                    detail::is_vec<T>::value)),
                  T>
-permute_group_by_xor(Group, T x, typename Group::linear_id_type mask) {
+permute_group_by_xor(Group g, T x, typename Group::linear_id_type mask) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::SubgroupShuffleXor(x, mask);
+  return sycl::detail::spirv::ShuffleXor(g, x, mask);
 #else
+  (void)g;
   (void)x;
   (void)mask;
   throw sycl::exception(make_error_code(errc::feature_not_supported),
@@ -579,14 +588,17 @@ permute_group_by_xor(Group, T x, typename Group::linear_id_type mask) {
 // TODO: remove check for detail::is_vec<T> once sycl::vec is trivially
 // copyable.
 template <typename Group, typename T>
-std::enable_if_t<(std::is_same_v<std::decay_t<Group>, sub_group> &&
+std::enable_if_t<((std::is_same_v<std::decay_t<Group>, sub_group> ||
+                   sycl::ext::oneapi::experimental::is_user_constructed_group_v<
+                       std::decay_t<Group>>) &&
                   (std::is_trivially_copyable_v<T> ||
                    detail::is_vec<T>::value)),
                  T>
-select_from_group(Group, T x, typename Group::id_type local_id) {
+select_from_group(Group g, T x, typename Group::id_type local_id) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::SubgroupShuffle(x, local_id);
+  return sycl::detail::spirv::Shuffle(g, x, local_id);
 #else
+  (void)g;
   (void)x;
   (void)local_id;
   throw sycl::exception(make_error_code(errc::feature_not_supported),
diff --git a/sycl/include/sycl/sub_group.hpp b/sycl/include/sycl/sub_group.hpp
index 9781195d10c4c..057e22ce7113c 100644
--- a/sycl/include/sycl/sub_group.hpp
+++ b/sycl/include/sycl/sub_group.hpp
@@ -223,7 +223,7 @@ struct sub_group {
   __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.")
   T shuffle(T x, id_type local_id) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::spirv::SubgroupShuffle(x, local_id);
+    return sycl::detail::spirv::Shuffle(*this, x, local_id);
 #else
     (void)x;
     (void)local_id;
@@ -236,7 +236,7 @@ struct sub_group {
   __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.")
   T shuffle_down(T x, uint32_t delta) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::spirv::SubgroupShuffleDown(x, delta);
+    return sycl::detail::spirv::ShuffleDown(*this, x, delta);
 #else
     (void)x;
     (void)delta;
@@ -249,7 +249,7 @@ struct sub_group {
   __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.")
   T shuffle_up(T x, uint32_t delta) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::spirv::SubgroupShuffleUp(x, delta);
+    return sycl::detail::spirv::ShuffleUp(*this, x, delta);
 #else
     (void)x;
     (void)delta;
@@ -262,7 +262,7 @@ struct sub_group {
   __SYCL_DEPRECATED("Shuffles in the sub-group class are deprecated.")
   T shuffle_xor(T x, id_type value) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::spirv::SubgroupShuffleXor(x, value);
+    return sycl::detail::spirv::ShuffleXor(*this, x, value);
 #else
     (void)x;
     (void)value;
diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
index 03cb9e5ba6a7f..a06353383b80f 100644
--- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
@@ -2,7 +2,8 @@
 // RUN: %{run} %t.out
 //
 // REQUIRES: gpu
-// UNSUPPORTED: hip
+// REQUIRES: sg-32
+// REQUIRES: aspect-ext_oneapi_ballot_group
 
 #include <sycl/sycl.hpp>
 #include <vector>
@@ -13,13 +14,6 @@ class TestKernel;
 int main() {
   sycl::queue Q;
 
-  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
-  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
-    std::cout << "Test skipped due to missing support for sub-group size 32."
-              << std::endl;
-    return 0;
-  }
-
   sycl::buffer<size_t, 1> TmpBuf{sycl::range{32}};
   sycl::buffer<bool, 1> BarrierBuf{sycl::range{32}};
   sycl::buffer<bool, 1> BroadcastBuf{sycl::range{32}};
@@ -29,6 +23,10 @@ int main() {
   sycl::buffer<bool, 1> ReduceBuf{sycl::range{32}};
   sycl::buffer<bool, 1> ExScanBuf{sycl::range{32}};
   sycl::buffer<bool, 1> IncScanBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> ShiftLeftBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> ShiftRightBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> SelectBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> PermuteXorBuf{sycl::range{32}};
 
   const auto NDR = sycl::nd_range<1>{32, 32};
   Q.submit([&](sycl::handler &CGH) {
@@ -41,6 +39,10 @@ int main() {
     sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
     sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
     sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only};
+    sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only};
+    sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only};
     const auto KernelFunc =
         [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
           auto WI = item.get_global_id();
@@ -49,6 +51,7 @@ int main() {
           // Split into odd and even work-items.
           bool Predicate = WI % 2 == 0;
           auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
+          uint32_t BallotGroupSize = BallotGroup.get_local_linear_range();
 
           // Check all other members' writes are visible after a barrier.
           TmpAcc[WI] = 1;
@@ -92,8 +95,7 @@ int main() {
 
           uint32_t ReduceResult =
               sycl::reduce_over_group(BallotGroup, 1, sycl::plus<>());
-          ReduceAcc[WI] =
-              (ReduceResult == BallotGroup.get_local_linear_range());
+          ReduceAcc[WI] = (ReduceResult == BallotGroupSize);
 
           uint32_t ExScanResult =
               sycl::exclusive_scan_over_group(BallotGroup, 1, sycl::plus<>());
@@ -102,6 +104,24 @@ int main() {
           uint32_t IncScanResult =
               sycl::inclusive_scan_over_group(BallotGroup, 1, sycl::plus<>());
           IncScanAcc[WI] = (IncScanResult == LID + 1);
+
+          uint32_t ShiftLeftResult =
+              sycl::shift_group_left(BallotGroup, LID, 2);
+          ShiftLeftAcc[WI] =
+              (LID + 2 >= BallotGroupSize || ShiftLeftResult == LID + 2);
+
+          uint32_t ShiftRightResult =
+              sycl::shift_group_right(BallotGroup, LID, 2);
+          ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2);
+
+          uint32_t SelectResult = sycl::select_from_group(
+              BallotGroup, LID,
+              (BallotGroup.get_local_id() + 2) % BallotGroupSize);
+          SelectAcc[WI] = (SelectResult == (LID + 2) % BallotGroupSize);
+
+          uint32_t PermuteXorResult =
+              sycl::permute_group_by_xor(BallotGroup, LID, 2);
+          PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ 2));
         };
     CGH.parallel_for<TestKernel>(NDR, KernelFunc);
   });
@@ -114,6 +134,10 @@ int main() {
   sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
   sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
   sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
+  sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only};
+  sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only};
+  sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only};
+  sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only};
   for (int WI = 0; WI < 32; ++WI) {
     assert(BarrierAcc[WI] == true);
     assert(BroadcastAcc[WI] == true);
@@ -123,6 +147,14 @@ int main() {
     assert(ReduceAcc[WI] == true);
     assert(ExScanAcc[WI] == true);
     assert(IncScanAcc[WI] == true);
+    // TODO: Enable for CUDA devices when issue with shuffles have been
+    // addressed.
+    if (Q.get_backend() != sycl::backend::ext_oneapi_cuda) {
+      assert(ShiftLeftAcc[WI] == true);
+      assert(ShiftRightAcc[WI] == true);
+      assert(SelectAcc[WI] == true);
+      assert(PermuteXorAcc[WI] == true);
+    }
   }
   return 0;
 }
diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
index 74ec9d24f212a..b83ee45f054da 100644
--- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
@@ -3,7 +3,7 @@
 //
 // REQUIRES: gpu
 // REQUIRES: sg-32
-// UNSUPPORTED: hip
+// REQUIRES: aspect-ext_oneapi_fixed_size_group
 
 #include <sycl/sycl.hpp>
 #include <vector>
@@ -25,6 +25,10 @@ template <size_t PartitionSize> void test() {
   sycl::buffer<bool, 1> ReduceBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> ExScanBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> IncScanBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ShiftLeftBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ShiftRightBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> SelectBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> PermuteXorBuf{sycl::range{SGSize}};
 
   const auto NDR = sycl::nd_range<1>{SGSize, SGSize};
   Q.submit([&](sycl::handler &CGH) {
@@ -37,6 +41,10 @@ template <size_t PartitionSize> void test() {
     sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
     sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
     sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only};
+    sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only};
+    sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only};
     const auto KernelFunc =
         [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] {
           auto WI = item.get_global_id();
@@ -95,6 +103,23 @@ template <size_t PartitionSize> void test() {
           uint32_t IncScanResult =
               sycl::inclusive_scan_over_group(Partition, 1, sycl::plus<>());
           IncScanAcc[WI] = (IncScanResult == LID + 1);
+
+          uint32_t ShiftLeftResult = sycl::shift_group_left(Partition, LID, 2);
+          ShiftLeftAcc[WI] =
+              (LID + 2 >= PartitionSize || ShiftLeftResult == LID + 2);
+
+          uint32_t ShiftRightResult =
+              sycl::shift_group_right(Partition, LID, 2);
+          ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2);
+
+          uint32_t SelectResult = sycl::select_from_group(
+              Partition, LID, (Partition.get_local_id() + 2) % PartitionSize);
+          SelectAcc[WI] = (SelectResult == (LID + 2) % PartitionSize);
+
+          uint32_t Mask = PartitionSize <= 2 ? 0 : 2;
+          uint32_t PermuteXorResult =
+              sycl::permute_group_by_xor(Partition, LID, Mask);
+          PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ Mask));
         };
     CGH.parallel_for<TestKernel<PartitionSize>>(NDR, KernelFunc);
   });
@@ -107,6 +132,10 @@ template <size_t PartitionSize> void test() {
   sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
   sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
   sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
+  sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only};
+  sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only};
+  sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only};
+  sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only};
   for (int WI = 0; WI < SGSize; ++WI) {
     assert(BarrierAcc[WI] == true);
     assert(BroadcastAcc[WI] == true);
@@ -116,6 +145,10 @@ template <size_t PartitionSize> void test() {
     assert(ReduceAcc[WI] == true);
     assert(ExScanAcc[WI] == true);
     assert(IncScanAcc[WI] == true);
+    assert(ShiftLeftAcc[WI] == true);
+    assert(ShiftRightAcc[WI] == true);
+    assert(SelectAcc[WI] == true);
+    assert(PermuteXorAcc[WI] == true);
   }
 }
 
diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp
index 93636a8156167..4279b58d5c890 100644
--- a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp
@@ -2,7 +2,8 @@
 // RUN: %{run} %t.out
 //
 // REQUIRES: gpu
-// UNSUPPORTED: hip
+// REQUIRES: sg-32
+// REQUIRES: aspect-ext_oneapi_opportunistic_group
 
 #include <sycl/sycl.hpp>
 #include <vector>
@@ -16,13 +17,6 @@ constexpr uint32_t ArbitraryItem = 5;
 int main() {
   sycl::queue Q;
 
-  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
-  if (std::find(SGSizes.begin(), SGSizes.end(), SGSize) == SGSizes.end()) {
-    std::cout << "Test skipped due to missing support for sub-group size 32."
-              << std::endl;
-    return 0;
-  }
-
   sycl::buffer<size_t, 1> TmpBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> BarrierBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> BroadcastBuf{sycl::range{SGSize}};
@@ -32,6 +26,10 @@ int main() {
   sycl::buffer<bool, 1> ReduceBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> ExScanBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> IncScanBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ShiftLeftBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ShiftRightBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> SelectBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> PermuteXorBuf{sycl::range{SGSize}};
 
   const auto NDR = sycl::nd_range<1>{SGSize, SGSize};
   Q.submit([&](sycl::handler &CGH) {
@@ -44,6 +42,10 @@ int main() {
     sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
     sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
     sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only};
+    sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only};
+    sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only};
     const auto KernelFunc =
         [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] {
           auto WI = item.get_global_id();
@@ -68,6 +70,8 @@ int main() {
 
             // Simple check of group algorithms.
             uint32_t LID = OpportunisticGroup.get_local_linear_id();
+            uint32_t OpportunisticGroupSize =
+                OpportunisticGroup.get_local_linear_range();
 
             uint32_t BroadcastResult =
                 sycl::group_broadcast(OpportunisticGroup, OriginalLID, 0);
@@ -95,6 +99,26 @@ int main() {
             uint32_t IncScanResult = sycl::inclusive_scan_over_group(
                 OpportunisticGroup, 1, sycl::plus<>());
             IncScanAcc[WI] = (IncScanResult == LID + 1);
+
+            uint32_t ShiftLeftResult =
+                sycl::shift_group_left(OpportunisticGroup, LID, 2);
+            ShiftLeftAcc[WI] = (LID + 2 >= OpportunisticGroupSize ||
+                                ShiftLeftResult == LID + 2);
+
+            uint32_t ShiftRightResult =
+                sycl::shift_group_right(OpportunisticGroup, LID, 2);
+            ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2);
+
+            uint32_t SelectResult = sycl::select_from_group(
+                OpportunisticGroup, LID,
+                (OpportunisticGroup.get_local_id() + 2) %
+                    OpportunisticGroupSize);
+            SelectAcc[WI] =
+                (SelectResult == (LID + 2) % OpportunisticGroupSize);
+
+            uint32_t PermuteXorResult =
+                sycl::permute_group_by_xor(OpportunisticGroup, LID, 0);
+            PermuteXorAcc[WI] = (PermuteXorResult == LID);
           } else {
             BarrierAcc[WI] = false;
             BroadcastAcc[WI] = false;
@@ -104,6 +128,10 @@ int main() {
             ReduceAcc[WI] = false;
             ExScanAcc[WI] = false;
             IncScanAcc[WI] = false;
+            ShiftLeftAcc[WI] = false;
+            ShiftRightAcc[WI] = false;
+            SelectAcc[WI] = false;
+            PermuteXorAcc[WI] = false;
           }
         };
     CGH.parallel_for<TestKernel>(NDR, KernelFunc);
@@ -117,6 +145,10 @@ int main() {
   sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
   sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
   sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
+  sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only};
+  sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only};
+  sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only};
+  sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only};
   for (uint32_t WI = 0; WI < 32; ++WI) {
     bool ExpectedResult = (WI == ArbitraryItem);
     assert(BarrierAcc[WI] == ExpectedResult);
@@ -127,6 +159,10 @@ int main() {
     assert(ReduceAcc[WI] == ExpectedResult);
     assert(ExScanAcc[WI] == ExpectedResult);
     assert(IncScanAcc[WI] == ExpectedResult);
+    assert(ShiftLeftAcc[WI] == ExpectedResult);
+    assert(ShiftRightAcc[WI] == ExpectedResult);
+    assert(SelectAcc[WI] == ExpectedResult);
+    assert(PermuteXorAcc[WI] == ExpectedResult);
   }
   return 0;
 }
diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp
index 2e50b876590db..c98cf1901947f 100644
--- a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp
@@ -3,7 +3,8 @@
 //
 // REQUIRES: gpu
 // REQUIRES: sg-32
-// UNSUPPORTED: cuda || hip || windows
+// REQUIRES: aspect-ext_oneapi_tangle_group
+// UNSUPPORTED: cuda || windows
 // Tangle groups exhibit unpredictable behavior on Windows.
 // The test is disabled while we investigate the root cause.
 
@@ -27,6 +28,10 @@ int main() {
   sycl::buffer<bool, 1> ReduceBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> ExScanBuf{sycl::range{SGSize}};
   sycl::buffer<bool, 1> IncScanBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ShiftLeftBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ShiftRightBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> SelectBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> PermuteXorBuf{sycl::range{SGSize}};
 
   const auto NDR = sycl::nd_range<1>{SGSize, SGSize};
   Q.submit([&](sycl::handler &CGH) {
@@ -39,6 +44,10 @@ int main() {
     sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
     sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
     sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftLeftAcc{ShiftLeftBuf, CGH, sycl::write_only};
+    sycl::accessor ShiftRightAcc{ShiftRightBuf, CGH, sycl::write_only};
+    sycl::accessor SelectAcc{SelectBuf, CGH, sycl::write_only};
+    sycl::accessor PermuteXorAcc{PermuteXorBuf, CGH, sycl::write_only};
     const auto KernelFunc =
         [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] {
           auto WI = item.get_global_id();
@@ -85,6 +94,21 @@ int main() {
             uint32_t IncScanResult =
                 sycl::inclusive_scan_over_group(Tangle, 1, sycl::plus<>());
             IncScanAcc[WI] = (IncScanResult == LID + 1);
+
+            uint32_t ShiftLeftResult = sycl::shift_group_left(Tangle, LID, 2);
+            ShiftLeftAcc[WI] =
+                (LID + 2 >= TangleSize || ShiftLeftResult == LID + 2);
+
+            uint32_t ShiftRightResult = sycl::shift_group_right(Tangle, LID, 2);
+            ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2);
+
+            uint32_t SelectResult = sycl::select_from_group(
+                Tangle, LID, (Tangle.get_local_id() + 2) % TangleSize);
+            SelectAcc[WI] = (SelectResult == (LID + 2) % TangleSize);
+
+            uint32_t PermuteXorResult =
+                sycl::permute_group_by_xor(Tangle, LID, 2);
+            PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ 2));
           };
 
           // Split into three groups of different sizes, using control flow
@@ -125,6 +149,10 @@ int main() {
   sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
   sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
   sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
+  sycl::host_accessor ShiftLeftAcc{ShiftLeftBuf, sycl::read_only};
+  sycl::host_accessor ShiftRightAcc{ShiftRightBuf, sycl::read_only};
+  sycl::host_accessor SelectAcc{SelectBuf, sycl::read_only};
+  sycl::host_accessor PermuteXorAcc{PermuteXorBuf, sycl::read_only};
   for (int WI = 0; WI < SGSize; ++WI) {
     assert(BarrierAcc[WI] == true);
     assert(BroadcastAcc[WI] == true);
@@ -134,6 +162,10 @@ int main() {
     assert(ReduceAcc[WI] == true);
     assert(ExScanAcc[WI] == true);
     assert(IncScanAcc[WI] == true);
+    assert(ShiftLeftAcc[WI] == true);
+    assert(ShiftRightAcc[WI] == true);
+    assert(SelectAcc[WI] == true);
+    assert(PermuteXorAcc[WI] == true);
   }
   return 0;
 }