diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index 209fd86411369..e86e2a9a9d969 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -645,6 +645,104 @@ __esimd_lsc_xatomic_bti_2( } #endif // __SYCL_DEVICE_ONLY__ +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// +/// @tparam Ty is element type. +/// @tparam InternalOp is operation type. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam AddressScale is the address scale. +/// @tparam ImmOffset is the immediate offset added to each address. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements per address. +/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +/// @param pred is predicates. +/// @param offsets is the zero-based offsets. +template +__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> +__esimd_lsc_xatomic_slm_0(__ESIMD_DNS::simd_mask_storage_t pred, + __ESIMD_DNS::vector_type_t offsets) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; +} +#endif // __SYCL_DEVICE_ONLY__ + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// +/// @tparam Ty is element type. +/// @tparam InternalOp is operation type. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam AddressScale is the address scale. +/// @tparam ImmOffset is the immediate offset added to each address. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements per address. +/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +/// @param pred is predicates. +/// @param offsets is the zero-based offsets. 
+/// @param src0 is the first atomic operand. +template +__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> +__esimd_lsc_xatomic_slm_1( + __ESIMD_DNS::simd_mask_storage_t pred, + __ESIMD_DNS::vector_type_t offsets, + __ESIMD_DNS::vector_type_t()> src0) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; +} +#endif // __SYCL_DEVICE_ONLY__ + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// +/// @tparam Ty is element type. +/// @tparam InternalOp is operation type. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam AddressScale is the address scale. +/// @tparam ImmOffset is the immediate offset added to each address. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements per address. +/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +/// @param pred is predicates. +/// @param offsets is the zero-based offsets. +/// @param src0 is the first atomic operand. +/// @param src1 is the second atomic operand. +template +__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> +__esimd_lsc_xatomic_slm_2( + __ESIMD_DNS::simd_mask_storage_t pred, + __ESIMD_DNS::vector_type_t offsets, + __ESIMD_DNS::vector_type_t()> src0, + __ESIMD_DNS::vector_type_t()> src1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; +} +#endif // __SYCL_DEVICE_ONLY__ + __ESIMD_INTRIN void __esimd_slm_init(uint32_t size) #ifdef __SYCL_DEVICE_ONLY__ ; diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index b21293f87c522..e9e70f65c294b 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -3680,42 +3680,359 @@ lsc_format_ret(__ESIMD_NS::simd Vals) { } } +/// SLM atomic. 
+/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param pred is predicate. +/// +/// @return A vector of the old values at the memory locations before the +/// update. + +template +__ESIMD_API std::enable_if_t() == 0, simd> +slm_atomic_update_impl(simd offsets, simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + constexpr int IOp = lsc_to_internal_atomic_op(); + simd Tmp = + __esimd_lsc_xatomic_slm_0(pred.data(), offsets.data()); + return lsc_format_ret(Tmp); +} + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param src0 is the first atomic operand. +/// @param pred is predicate. +/// +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t() == 1, simd> +slm_atomic_update_impl(simd offsets, simd src0, + simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + constexpr int IOp = lsc_to_internal_atomic_op(); + simd Msg_data = lsc_format_input(src0); + simd Tmp = + __esimd_lsc_xatomic_slm_1(pred.data(), offsets.data(), + Msg_data.data()); + return lsc_format_ret(Tmp); +} + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param src0 is the first atomic operand (expected value). +/// @param src1 is the second atomic operand (new value). +/// @param pred is predicates. +/// +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API simd slm_atomic_update_impl(simd offsets, + simd src0, simd src1, + simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + constexpr int IOp = lsc_to_internal_atomic_op(); + simd Msg_data0 = lsc_format_input(src0); + simd Msg_data1 = lsc_format_input(src1); + simd Tmp = + __esimd_lsc_xatomic_slm_2(pred.data(), offsets.data(), + Msg_data0.data(), Msg_data1.data()); + return lsc_format_ret(Tmp); +} + } // namespace detail -/// Atomic update operation performed on SLM. No source operands version. -/// See description of template and function parameters in @ref -/// usm_atomic_update0 "atomic update" operation docs. -template > -__ESIMD_API simd slm_atomic_update(simd offsets, - simd_mask mask) { - detail::check_atomic(); - const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker()); - return __esimd_dword_atomic0(mask.data(), si, offsets.data()); +/// @anchor slm_atomic_update0 +/// @brief Atomic update operation performed on SLM. +/// No-argument variant of the atomic update operation. + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd_mask mask = 1); /// (slm-au0-1) + +/// The following functions do the same work as slm_atomic_update(). They accept +/// a local accessor \p lacc and the atomic update is done from SLM associated +/// with \p lacc plus \p byte_offset applied to it. If \p byte_offset +/// is omitted, then zero offset is used. + +/// simd atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd_mask<1> pred = 1); +/// // (lacc-au0-1) + +/// Usage of cache hints or non-standard operation width N requires DG2 or PVC. 
+ +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd_mask mask = 1); /// (slm-au0-1) +/// +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation - can be \c atomic_op::inc or +/// \c atomic_op::dec, \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, simd> +slm_atomic_update(simd byte_offset, simd_mask mask = 1) { + // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are + // supported only by LSC. + if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || + !__ESIMD_DNS::isPowerOf2(N, 32)) { + return slm_atomic_update_impl( + byte_offset, mask); + } else if constexpr (Op == atomic_op::load) { + if constexpr (std::is_integral_v) { + return slm_atomic_update(byte_offset, + simd(0), mask); + } else { + using Tint = detail::uint_type_t; + simd Res = slm_atomic_update( + byte_offset, simd(0), mask); + return Res.template bit_cast_view(); + } + } else { + detail::check_atomic(); + const auto si = get_surface_index(detail::LocalAccessorMarker()); + return __esimd_dword_atomic0(mask.data(), si, byte_offset.data()); + } } -/// Atomic update operation performed on SLM. One source operands version. -/// See description of template and function parameters in @ref -/// usm_atomic_update1 "atomic update" operation docs. 
-template > -__ESIMD_API simd slm_atomic_update(simd offsets, - simd src0, simd_mask mask) { - detail::check_atomic(); - const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker()); - return __esimd_dword_atomic1(mask.data(), si, offsets.data(), - src0.data()); -} - -/// Atomic update operation performed on SLM. Two source operands version. -/// See description of template and function parameters in @ref -/// usm_atomic_update2 "atomic update" operation docs. -template > -__ESIMD_API simd slm_atomic_update(simd offsets, - simd src0, simd src1, - simd_mask mask) { - detail::check_atomic(); - const auto si = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker()); - return __esimd_dword_atomic2(mask.data(), si, offsets.data(), - src0.data(), src1.data()); +/// simd atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd_mask pred = 1); +/// // (lacc-au0-1) +/// Atomically updates \c N memory locations in SLM associated +/// with the local accessor \p lacc at the given \p byte_offset, +/// and returns a vector of old values found at the memory locations before +/// update. +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + sycl::detail::acc_properties::is_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, + simd_mask mask = 1) { + byte_offset += detail::localAccessorToOffset(lacc); + return slm_atomic_update(byte_offset, mask); +} + +/// One argument variant of the atomic update operation. + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, +/// simd_mask mask = 1); /// (slm-au1-1) +/// + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// + +/// Usage of cache hints or non-standard operation width N requires DG2 or PVC. 
+ +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, +/// simd_mask mask = 1) /// (slm-au1-1) +/// +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, simd> +slm_atomic_update(simd byte_offset, simd src0, + simd_mask mask = 1) { + // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are + // supported only by LSC. + if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || + !__ESIMD_DNS::isPowerOf2(N, 32)) { + // half and short are supported in LSC. 
+ return slm_atomic_update_impl( + byte_offset, src0, mask); + } else if constexpr (Op == atomic_op::store) { + if constexpr (std::is_integral_v) { + return slm_atomic_update(byte_offset, src0, mask); + } else { + using Tint = detail::uint_type_t; + simd Res = slm_atomic_update( + byte_offset, src0.template bit_cast_view(), mask); + return Res.template bit_cast_view(); + } + } else { + detail::check_atomic(); + const auto si = get_surface_index(detail::LocalAccessorMarker()); + return __esimd_dword_atomic1(mask.data(), si, byte_offset.data(), + src0.data()); + } +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + sycl::detail::acc_properties::is_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, simd src0, + simd_mask mask = 1) { + byte_offset += detail::localAccessorToOffset(lacc); + return slm_atomic_update(byte_offset, src0, mask); +} + +/// Two argument variant of the atomic update operation. 
+ +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, simd src1, +/// simd_mask mask = 1); /// (slm-au2-1) + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd src1, +/// simd_mask<1> pred = 1); // (lacc-au2-1) +/// + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, simd src1, +/// simd_mask mask = 1); /// (slm-au2-1) +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, simd> +slm_atomic_update(simd byte_offset, simd src0, + simd src1, simd_mask mask = 1) { + // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are + // supported only by LSC. + if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || + !__ESIMD_DNS::isPowerOf2(N, 32)) { + // 2-argument lsc_atomic_update arguments order matches the standard one - + // expected value first, then new value. But atomic_update uses reverse + // order, hence the src1/src0 swap. 
+ return detail::slm_atomic_update_impl( + byte_offset, src1, src0, mask); + } else { + detail::check_atomic(); + const auto si = get_surface_index(detail::LocalAccessorMarker()); + return __esimd_dword_atomic2(mask.data(), si, byte_offset.data(), + src0.data(), src1.data()); + } +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// simd src1, +/// simd_mask<1> pred = 1); // (lacc-au2-1) +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + sycl::detail::acc_properties::is_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, simd src0, + simd src1, simd_mask mask = 1) { + byte_offset += detail::localAccessorToOffset(lacc); + return slm_atomic_update(byte_offset, src0, src1, mask); } /// @} sycl_esimd_memory_slm @@ -5489,49 +5806,6 @@ atomic_update(AccessorTy acc, simd byte_offset, return atomic_update(acc, byte_offset, mask, props); } -/// Variant of \c atomic_update that uses \c local_accessor as a parameter. -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets, and returns a vector of old values found at the -/// memory locations before update. The update operation has no arguments -/// in addition to the value at the memory location. -/// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. 
-/// @return A vector of the old values at the memory locations before the -/// update. -/// -template -__ESIMD_API __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && - sycl::detail::acc_properties::is_local_accessor_v, - simd> -atomic_update(AccessorTy acc, simd byte_offset, - simd_mask mask) { - if constexpr (Op == atomic_op::load) { - if constexpr (std::is_integral_v) { - return atomic_update(acc, byte_offset, - simd(0), mask); - } else { - using Tint = detail::uint_type_t; - simd Res = atomic_update( - acc, byte_offset, simd(0), mask); - return Res.template bit_cast_view(); - } - } else { - return slm_atomic_update( - byte_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask); - } -} - /// simd /// atomic_update(AccessorT acc, simd_view byte_offset, /// simd_mask mask, props = {}); /// (acc-au0-3) @@ -5604,36 +5878,6 @@ atomic_update(AccessorTy acc, simd_view byte_offset, return atomic_update(acc, byte_offset.read(), mask, props); } -/// A variation of \c atomic_update API with \c offsets represented as -/// \c simd_view object. -/// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. -/// 64-bit offsets are supported only when stateless memory accesses are -/// enforced, i.e. accessor based accesses are automatically converted to -/// stateless accesses. -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. -/// @return A vector of the old values at the memory locations before the -/// update. 
-/// -template > -__ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && std::is_integral_v && - !std::is_pointer_v && - sycl::detail::acc_properties::is_local_accessor_v, - simd> -atomic_update(AccessorTy acc, simd_view byte_offset, - simd_mask mask) { - return atomic_update(acc, byte_offset.read(), mask); -} - /// A variation of \c atomic_update API with \c offset represented as /// scalar. /// @@ -5936,56 +6180,6 @@ atomic_update(AccessorTy acc, simd byte_offset, return atomic_update(acc, byte_offset, src0.read(), mask, props); } -/// Variant of \c atomic_update that uses \c local_accessor as a parameter. -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets, and returns a vector of old values found at the -/// memory locations before update. The update operation has 1 additional -/// argument. -/// -/// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, -/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, -/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, -/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c -/// atomic_op::fsub, \c atomic_op::store. -/// @tparam Tx The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param src0 The additional argument. -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. -/// @return A vector of the old values at the memory locations before the -/// update. 
-/// -template -__ESIMD_API std::enable_if_t< - sycl::detail::acc_properties::is_local_accessor_v, simd> -atomic_update(AccessorTy acc, simd offset, simd src0, - simd_mask mask) { - if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) || - (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) { - // Auto-convert FP atomics to LSC version. - return atomic_update(), Tx, N>(acc, offset, - src0, mask); - } else if constexpr (Op == atomic_op::store) { - if constexpr (std::is_integral_v) { - return atomic_update(acc, offset, src0, mask); - } else { - using Tint = detail::uint_type_t; - simd Res = atomic_update( - acc, offset, src0.template bit_cast_view(), mask); - return Res.template bit_cast_view(); - } - } else { - return slm_atomic_update( - offset + __ESIMD_DNS::localAccessorToOffset(acc), src0, mask); - } -} - /// simd /// atomic_update(AccessorT acc, simd_view byte_offset, /// simd src0, @@ -6023,6 +6217,7 @@ template __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 1 && !std::is_pointer_v && + !sycl::detail::acc_properties::is_local_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> atomic_update(AccessorTy acc, simd_view byte_offset, @@ -6065,6 +6260,7 @@ template __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 1 && !std::is_pointer_v && + !sycl::detail::acc_properties::is_local_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> atomic_update(AccessorTy acc, simd_view byte_offset, @@ -6414,43 +6610,6 @@ atomic_update(AccessorTy acc, simd_view byte_offset, props); } -/// Variant of \c atomic_update that uses \c local_accessor as a parameter. -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets and returns a vector of old -/// values found at the memory locations before update. The update operation -/// has 2 additional arguments. 
-/// -/// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam Tx The vector element type. -/// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. -/// @param acc The SYCL accessor. -/// @param offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param src0 The first additional argument (new value). -/// @param src1 The second additional argument (expected value). -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. -/// @return A vector of the old values at the memory locations before the -/// update. -/// -template -__ESIMD_API std::enable_if_t< - sycl::detail::acc_properties::is_local_accessor_v, simd> -atomic_update(AccessorTy acc, simd offset, simd src0, - simd src1, simd_mask mask) { - if constexpr (Op == atomic_op::fcmpxchg) { - // Auto-convert FP atomics to LSC version. - return atomic_update(), Tx, N>( - acc, offset, src0, src1, mask); - } else { - return slm_atomic_update( - offset + __ESIMD_DNS::localAccessorToOffset(acc), src0, src1, mask); - } -} - /// A variation of \c atomic_update API with \c offsets represented as /// scalar. /// diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index f4eda3b4c98f6..36f726b38eed5 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -285,96 +285,17 @@ __esimd_lsc_store2d_stateless(__ESIMD_DNS::simd_mask_storage_t Pred, } #endif // __SYCL_DEVICE_ONLY__ -/// SLM atomic. +/// Memory fence. 
/// Supported platforms: DG2, PVC /// -/// @tparam Ty is element type. -/// @tparam InternalOp is operation type. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access) -/// @param pred is predicates. -/// @param offsets is the zero-based offsets. -template -__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> -__esimd_lsc_xatomic_slm_0(__ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t offsets) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else // __SYCL_DEVICE_ONLY__ -{ - __ESIMD_UNSUPPORTED_ON_HOST; -} -#endif // __SYCL_DEVICE_ONLY__ - -/// SLM atomic. -/// Supported platforms: DG2, PVC -/// -/// @tparam Ty is element type. -/// @tparam InternalOp is operation type. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access) -/// @param pred is predicates. -/// @param offsets is the zero-based offsets. -/// @param src0 is the first atomic operand. -template -__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> -__esimd_lsc_xatomic_slm_1( - __ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t offsets, - __ESIMD_DNS::vector_type_t()> src0) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else // __SYCL_DEVICE_ONLY__ -{ - __ESIMD_UNSUPPORTED_ON_HOST; -} -#endif // __SYCL_DEVICE_ONLY__ - -/// SLM atomic. 
-/// Supported platforms: DG2, PVC -/// -/// @tparam Ty is element type. -/// @tparam InternalOp is operation type. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam Kind is the Sfid shaded function. +/// @tparam FenceOp is the fence operation. +/// @tparam Scope is the operation scope. /// @tparam N is the SIMD size of operation (the number of addresses to access) /// @param pred is predicates. -/// @param offsets is the zero-based offsets. -/// @param src0 is the first atomic operand. -/// @param src1 is the second atomic operand. -template -__ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> -__esimd_lsc_xatomic_slm_2( - __ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t offsets, - __ESIMD_DNS::vector_type_t()> src0, - __ESIMD_DNS::vector_type_t()> src1) +template <__ESIMD_ENS::lsc_memory_kind Kind, __ESIMD_ENS::lsc_fence_op FenceOp, + __ESIMD_ENS::lsc_scope Scope, int N> +__ESIMD_INTRIN void __esimd_lsc_fence(__ESIMD_DNS::simd_mask_storage_t pred) #ifdef __SYCL_DEVICE_ONLY__ ; #else // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 358af22ef00fb..e4cfca89f2953 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -2588,24 +2588,7 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, __ESIMD_API __ESIMD_NS::simd lsc_slm_atomic_update(__ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { - static_assert(sizeof(T) == 2 || sizeof(T) == 4, "Unsupported data type"); - __ESIMD_EDNS::check_lsc_vector_size<1>(); - 
__ESIMD_EDNS::check_lsc_data_size(); - __ESIMD_DNS::check_atomic(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - constexpr int IOp = detail::lsc_to_internal_atomic_op(); - __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_slm_0(pred.data(), offsets.data()); - return detail::lsc_format_ret(Tmp); + return __ESIMD_DNS::slm_atomic_update_impl(offsets, pred); } /// SLM atomic. @@ -2628,29 +2611,7 @@ __ESIMD_API __ESIMD_NS::simd lsc_slm_atomic_update(__ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { - static_assert(Op != __ESIMD_NS::atomic_op::fadd && - Op != __ESIMD_NS::atomic_op::fsub, - "fadd and fsub are not supported for slm."); - static_assert(sizeof(T) == 2 || sizeof(T) == 4, "Unsupported data type"); - detail::check_lsc_vector_size<1>(); - detail::check_lsc_data_size(); - __ESIMD_DNS::check_atomic(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - constexpr int IOp = detail::lsc_to_internal_atomic_op(); - __ESIMD_NS::simd Msg_data = detail::lsc_format_input(src0); - __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_slm_1(pred.data(), offsets.data(), - Msg_data.data()); - return detail::lsc_format_ret(Tmp); + return __ESIMD_DNS::slm_atomic_update_impl(offsets, src0, pred); } /// SLM atomic. 
@@ -2674,29 +2635,8 @@ __ESIMD_API __ESIMD_NS::simd lsc_slm_atomic_update(__ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { - static_assert(sizeof(T) == 2 || sizeof(T) == 4 || - (Op == __ESIMD_NS::atomic_op::cmpxchg && sizeof(T) == 8), - "Unsupported data type"); - detail::check_lsc_vector_size<1>(); - detail::check_lsc_data_size(); - __ESIMD_DNS::check_atomic(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<1>(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - constexpr int IOp = detail::lsc_to_internal_atomic_op(); - __ESIMD_NS::simd Msg_data0 = detail::lsc_format_input(src0); - __ESIMD_NS::simd Msg_data1 = detail::lsc_format_input(src1); - __ESIMD_NS::simd Tmp = - __esimd_lsc_xatomic_slm_2( - pred.data(), offsets.data(), Msg_data0.data(), Msg_data1.data()); - return detail::lsc_format_ret(Tmp); + return __ESIMD_DNS::slm_atomic_update_impl(offsets, src0, src1, + pred); } /// USM pointer atomic. diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp new file mode 100644 index 0000000000000..e025d118d77fa --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp @@ -0,0 +1,756 @@ +//==-------atomic_update_slm.hpp - DPC++ ESIMD on-device test --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +constexpr int Signed = 1; +constexpr int Unsigned = 2; + +constexpr int64_t threads_per_group = 8; +constexpr int64_t n_groups = 1; +constexpr int64_t start_ind = 3; +constexpr int64_t masked_lane = 1; +constexpr int64_t repeat = 1; +constexpr int64_t stride = 4; + +// Helper functions + +const char *to_string(atomic_op op) { + switch (op) { + case atomic_op::add: + return "add"; + case atomic_op::sub: + return "sub"; + case atomic_op::inc: + return "inc"; + case atomic_op::dec: + return "dec"; + case atomic_op::umin: + return "umin"; + case atomic_op::umax: + return "umax"; + case atomic_op::xchg: + return "xchg"; + case atomic_op::cmpxchg: + return "cmpxchg"; + case atomic_op::bit_and: + return "bit_and"; + case atomic_op::bit_or: + return "bit_or"; + case atomic_op::bit_xor: + return "bit_xor"; + case atomic_op::smin: + return "smin"; + case atomic_op::smax: + return "smax"; + case atomic_op::fmax: + return "fmax"; + case atomic_op::fmin: + return "fmin"; + case atomic_op::fadd: + return "fadd"; + case atomic_op::fsub: + return "fsub"; + case atomic_op::fcmpxchg: + return "fcmpxchg"; + case atomic_op::load: + return "load"; + case atomic_op::store: + return "store"; + case atomic_op::predec: + return "predec"; + } + return ""; +} + +template inline bool any(simd_mask m, simd_mask ignore_mask) { + simd_mask m1 = 0; + m.merge(m1, ignore_mask); + return m.any(); +} + +// The main test function + +template class ImplF, bool UseMask> +bool test_slm(queue q) { + constexpr auto op = ImplF::atomic_op; + using CurAtomicOpT = decltype(op); + constexpr int n_args = ImplF::n_args; + + std::cout << "SLM testing" << " op=" << to_string(op) + << " T=" << esimd_test::type_name() << " N=" << N 
<< "\n\t" + << " UseMask=" << (UseMask ? "true" : "false") + << "{ thr_per_group=" << threads_per_group + << " n_groups=" << n_groups << " start_ind=" << start_ind + << " masked_lane=" << masked_lane << " repeat=" << repeat + << " stride=" << stride << " }..."; + + constexpr size_t size = start_ind + (N - 1) * stride + 1; + T *arr = malloc_shared(size, q); + constexpr int n_threads = threads_per_group * n_groups; + + for (int i = 0; i < size; ++i) { + arr[i] = ImplF::init(i); + } + + range<1> glob_rng(n_threads); + range<1> loc_rng(threads_per_group); + nd_range<1> rng(glob_rng, loc_rng); + + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for(rng, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + int i = ndi.get_global_id(0); + constexpr uint32_t SLMSize = size * sizeof(T); + slm_init(); + + simd offsets(start_ind * sizeof(T), stride * sizeof(T)); + simd data; + data.copy_from(arr); + + if (ndi.get_local_id(0) == 0) + slm_block_store(0, data); + + simd_mask m = 1; + if constexpr (UseMask) { + if (masked_lane < N) + m[masked_lane] = 0; + } + // Intra-work group barrier. 
+ barrier(); + + // the atomic operation itself applied in a loop: + for (int cnt = 0; cnt < repeat; ++cnt) { + if constexpr (n_args == 0) { + if constexpr (UseMask) { + slm_atomic_update(offsets, m); + } else { + slm_atomic_update(offsets); + } + } else if constexpr (n_args == 1) { + simd v0 = ImplF::arg0(i); + if constexpr (UseMask) { + slm_atomic_update(offsets, v0, m); + } else { + slm_atomic_update(offsets, v0); + } + } else if constexpr (n_args == 2) { + simd new_val = ImplF::arg0(i); // new value + simd exp_val = ImplF::arg1(i); // expected value + // do compare-and-swap in a loop until we get expected value; + // arg0 and arg1 must provide values which guarantee the loop + // is not endless: + if constexpr (UseMask) { + for (simd old_val = + slm_atomic_update(offsets, new_val, exp_val, m); + any(old_val < exp_val, !m); + old_val = + slm_atomic_update(offsets, new_val, exp_val, m)) + ; + } else { + for (simd old_val = + slm_atomic_update(offsets, new_val, exp_val); + any(old_val < exp_val, !m); + old_val = + slm_atomic_update(offsets, new_val, exp_val)) + ; + } + } + } + barrier(); + if (ndi.get_local_id(0) == 0) { + auto data0 = slm_block_load(0); + data0.copy_to(arr); + } + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(arr, q); + return false; + } + int err_cnt = 0; + + for (int i = 0; i < size; ++i) { + T gold = ImplF::gold(i, UseMask); + T test = arr[i]; + + if ((gold != test) && (++err_cnt < 10)) { + if (err_cnt == 1) { + std::cout << "\n"; + } + std::cout << " failed at index " << i << ": " << test << " != " << gold + << "(gold)\n"; + } + } + if (err_cnt > 0) { + std::cout << " FAILED\n pass rate: " + << ((float)(size - err_cnt) / (float)size) * 100.0f << "% (" + << (size - err_cnt) << "/" << size << ")\n"; + } else { + std::cout << " passed\n"; + } + free(arr, q); + return err_cnt == 0; +} + +template class ImplF, bool UseMask> +bool test_slm_acc(queue q) { + constexpr 
auto op = ImplF::atomic_op; + using CurAtomicOpT = decltype(op); + constexpr int n_args = ImplF::n_args; + + std::cout << "SLM ACC testing" << " op=" << to_string(op) + << " T=" << esimd_test::type_name() << " N=" << N << "\n\t" + << " UseMask=" << (UseMask ? "true" : "false") + << "{ thr_per_group=" << threads_per_group + << " n_groups=" << n_groups << " start_ind=" << start_ind + << " masked_lane=" << masked_lane << " repeat=" << repeat + << " stride=" << stride << " }..."; + + constexpr size_t size = start_ind + (N - 1) * stride + 1; + T *arr = malloc_shared(size, q); + constexpr int n_threads = threads_per_group * n_groups; + + for (int i = 0; i < size; ++i) { + arr[i] = ImplF::init(i); + } + + range<1> glob_rng(n_threads); + range<1> loc_rng(threads_per_group); + nd_range<1> rng(glob_rng, loc_rng); + + try { + auto e = q.submit([&](handler &cgh) { + local_accessor LocalAcc(size, cgh); + cgh.parallel_for(rng, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL { + int i = NDI.get_global_id(0); + uint16_t LocalID = NDI.get_local_id(0); + simd offsets(start_ind * sizeof(T), stride * sizeof(T)); + + if (LocalID == 0) + for (int I = 0; I < threads_per_group * N; I++) + LocalAcc[I] = arr[i * N + I]; + barrier(); + + simd_mask m = 1; + if constexpr (UseMask) { + if (masked_lane < N) + m[masked_lane] = 0; + } + // Intra-work group barrier. 
+ barrier(); + + // the atomic operation itself applied in a loop: + for (int cnt = 0; cnt < repeat; ++cnt) { + if constexpr (n_args == 0) { + if constexpr (UseMask) { + atomic_update(LocalAcc, offsets, m); + } else { + atomic_update(LocalAcc, offsets); + } + } else if constexpr (n_args == 1) { + simd v0 = ImplF::arg0(i); + if constexpr (UseMask) { + atomic_update(LocalAcc, offsets, v0, m); + } else { + atomic_update(LocalAcc, offsets, v0); + } + } else if constexpr (n_args == 2) { + simd new_val = ImplF::arg0(i); // new value + simd exp_val = ImplF::arg1(i); // expected value + // do compare-and-swap in a loop until we get expected value; + // arg0 and arg1 must provide values which guarantee the loop + // is not endless: + if constexpr (UseMask) { + for (simd old_val = atomic_update( + LocalAcc, offsets, new_val, exp_val, m); + any(old_val < exp_val, !m); + old_val = atomic_update(LocalAcc, offsets, new_val, + exp_val, m)) + ; + } else { + for (simd old_val = atomic_update(LocalAcc, offsets, + new_val, exp_val); + any(old_val < exp_val, !m); + old_val = atomic_update(LocalAcc, offsets, new_val, + exp_val)) + ; + } + } + } + barrier(); + if (LocalID == 0) + for (int I = 0; I < threads_per_group * N; I++) + arr[i * N + I] = LocalAcc[I]; + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(arr, q); + return false; + } + int err_cnt = 0; + + for (int i = 0; i < size; ++i) { + T gold = ImplF::gold(i, UseMask); + T test = arr[i]; + + if ((gold != test) && (++err_cnt < 10)) { + if (err_cnt == 1) { + std::cout << "\n"; + } + std::cout << " failed at index " << i << ": " << test << " != " << gold + << "(gold)\n"; + } + } + if (err_cnt > 0) { + std::cout << " FAILED\n pass rate: " + << ((float)(size - err_cnt) / (float)size) * 100.0f << "% (" + << (size - err_cnt) << "/" << size << ")\n"; + } else { + std::cout << " passed\n"; + } + free(arr, q); + return err_cnt == 0; +} + +// ----------------- 
Functions providing input and golden values for atomic +// ----------------- operations. + +static int dense_ind(int ind, int VL) { return (ind - start_ind) / stride; } + +static bool is_updated(int ind, int VL, bool use_mask) { + if ((ind < start_ind) || (((ind - start_ind) % stride) != 0)) { + return false; + } + int ii = dense_ind(ind, VL); + bool res = true; + if (use_mask) + res = (ii % VL) != masked_lane; + return res; +} + +// ----------------- Actual "traits" for each operation. + +template struct ImplIncBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 0; + + static T init(int i) { return (T)0; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) + ? (T)(repeat * threads_per_group * n_groups) + : init(i); + return gold; + } +}; + +template struct ImplDecBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 0; + static constexpr int base = 5; + + static T init(int i) { + return (T)(repeat * threads_per_group * n_groups + base); + } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) ? (T)base : init(i); + return gold; + } +}; + +// The purpose of this is validate that floating point data is correctly +// processed. +constexpr float FPDELTA = 0.5f; + +template struct ImplLoadBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 0; + + static T init(int i) { return (T)(i + FPDELTA); } + + static T gold(int i, bool use_mask) { + T gold = init(i); + return gold; + } +}; + +template struct ImplStoreBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + static constexpr T base = (T)(2 + FPDELTA); + + static T init(int i) { return 0; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) ? 
base : init(i); + return gold; + } + + static T arg0(int i) { return base; } +}; + +template struct ImplAdd { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + + static T init(int i) { return 0; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) + ? (T)(repeat * threads_per_group * n_groups * (T)(1 + FPDELTA)) + : init(i); + return gold; + } + + static T arg0(int i) { return (T)(1 + FPDELTA); } +}; + +template struct ImplSub { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + static constexpr T base = (T)(5 + FPDELTA); + + static T init(int i) { + return (T)(repeat * threads_per_group * n_groups * (T)(1 + FPDELTA) + base); + } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) ? base : init(i); + return gold; + } + + static T arg0(int i) { return (T)(1 + FPDELTA); } +}; + +template struct ImplMin { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + + static T init(int i) { return std::numeric_limits::max(); } + + static T gold(int i, bool use_mask) { + T ExpectedFoundMin; + if constexpr (std::is_signed_v) + ExpectedFoundMin = FPDELTA - (threads_per_group * n_groups - 1); + else + ExpectedFoundMin = FPDELTA; + T gold = is_updated(i, N, use_mask) ? ExpectedFoundMin : init(i); + return gold; + } + + static T arg0(int i) { + int64_t sign = std::is_signed_v ? -1 : 1; + return sign * i + FPDELTA; + } +}; + +template struct ImplMax { + static constexpr C atomic_op = Op; + static constexpr int n_args = 1; + + static T init(int i) { return std::numeric_limits::lowest(); } + + static T gold(int i, bool use_mask) { + T ExpectedFoundMax = FPDELTA; + if constexpr (!std::is_signed_v) + ExpectedFoundMax += threads_per_group * n_groups - 1; + + T gold = is_updated(i, N, use_mask) ? ExpectedFoundMax : init(i); + return gold; + } + + static T arg0(int i) { + int64_t sign = std::is_signed_v ? 
-1 : 1; + return sign * i + FPDELTA; + } +}; + +template +struct ImplStore : ImplStoreBase {}; +template +struct ImplLoad : ImplLoadBase {}; +template +struct ImplInc : ImplIncBase {}; +template +struct ImplDec : ImplDecBase {}; +template +struct ImplIntAdd : ImplAdd {}; +template +struct ImplIntSub : ImplSub {}; +template +struct ImplSMin : ImplMin {}; +template +struct ImplUMin : ImplMin {}; +template +struct ImplSMax : ImplMax {}; +template +struct ImplUMax : ImplMax {}; + +template +struct ImplFadd : ImplAdd {}; +template +struct ImplFsub : ImplSub {}; +template +struct ImplLSCFmin : ImplMin {}; +template +struct ImplLSCFmax : ImplMax {}; + +template struct ImplCmpxchgBase { + static constexpr C atomic_op = Op; + static constexpr int n_args = 2; + static constexpr T base = (T)(2 + FPDELTA); + + static T init(int i) { return base - 1; } + + static T gold(int i, bool use_mask) { + T gold = is_updated(i, N, use_mask) + ? (T)(threads_per_group * n_groups - 1 + base) + : init(i); + return gold; + } + + // "Replacement value" argument in CAS + static inline T arg0(int i) { return i + base; } + + // "Expected value" argument in CAS + static inline T arg1(int i) { return i + base - 1; } +}; + +template +struct ImplCmpxchg : ImplCmpxchgBase {}; + +template +struct ImplLSCFcmpwr : ImplCmpxchgBase {}; + +// ----------------- Main function and test combinations. 
+ +template class ImplF, + bool UseMask> +auto run_test(queue q) { + if constexpr (UseAcc) { + return test_slm_acc(q); + } else { + return test_slm(q); + } +} + +template class Op, bool UseMask, + bool UsePVCFeatures, bool UseAcc, int SignMask = (Signed | Unsigned)> +bool test_int_types(queue q) { + bool passed = true; + if constexpr (SignMask & Signed) { + if constexpr (UsePVCFeatures) + passed &= run_test(q); + + passed &= run_test(q); + + if constexpr (UsePVCFeatures) { + passed &= run_test(q); + } + } + + if constexpr (SignMask & Unsigned) { + if constexpr (UsePVCFeatures) + passed &= run_test(q); + + passed &= run_test(q); + + if constexpr (UsePVCFeatures) { + passed &= run_test(q); + } + } + return passed; +} + +template class Op, bool UseMask, + bool UsePVCFeatures, bool UseAcc> +bool test_fp_types(queue q) { + bool passed = true; + if constexpr (UsePVCFeatures) { + if constexpr (std::is_same_v, + ImplLSCFmax> || + std::is_same_v, + ImplLSCFmin> || + std::is_same_v, + ImplLSCFcmpwr>) { + auto dev = q.get_device(); + if (dev.has(sycl::aspect::fp16)) { + passed &= run_test(q); + } + } + } + + passed &= run_test(q); + + if constexpr (UsePVCFeatures) { + // TODO: fmin/max for double does not pass validation likely due to + // a driver bug. fcmpwr is hanging. + if constexpr (!std::is_same_v, ImplLSCFmax> && + !std::is_same_v, ImplLSCFmin> && + !std::is_same_v, ImplLSCFcmpwr>) { + if (q.get_device().has(sycl::aspect::atomic64) && + q.get_device().has(sycl::aspect::fp64)) { + passed &= run_test(q); + } + } + } + return passed; +} + +template