From 5b3ba9fd2bddacab5a13e4f3956eaf516c1d08e9 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 5 Feb 2024 08:30:13 -0800 Subject: [PATCH 1/8] [SYCL] Treat "ar" image format as PI_DEVICE_BINARY_TYPE_NATIVE (#12587) That's what AOT for Intel GPUs produces when targeting multiple devices at once. This PR is built on top of https://github.com/intel/llvm/pull/12586. --- sycl/source/detail/pi.cpp | 8 ++ .../KernelAndProgram/test_cache_jit_aot.cpp | 89 +++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 sycl/test-e2e/KernelAndProgram/test_cache_jit_aot.cpp diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 4dbeaccc9baf2..bbee0c1131feb 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -718,6 +718,14 @@ getBinaryImageFormat(const unsigned char *ImgData, size_t ImgSize) { return PI_DEVICE_BINARY_TYPE_NATIVE; } + if (MatchMagicNumber(std::array{'!', '<', 'a', 'r', 'c', 'h', '>', '\n'})) + // "ar" format is used to pack binaries for multiple devices, e.g. via + // + // -Xsycl-target-backend=spir64_gen "-device acm-g10,acm-g11" + // + // option. + return PI_DEVICE_BINARY_TYPE_NATIVE; + return PI_DEVICE_BINARY_TYPE_NONE; } diff --git a/sycl/test-e2e/KernelAndProgram/test_cache_jit_aot.cpp b/sycl/test-e2e/KernelAndProgram/test_cache_jit_aot.cpp new file mode 100644 index 0000000000000..856d1510edfc5 --- /dev/null +++ b/sycl/test-e2e/KernelAndProgram/test_cache_jit_aot.cpp @@ -0,0 +1,89 @@ +// Don't use normal %{run} as we need to control cache directory removal and +// cannot do that reliably when number of devices is unknown. +// +// REQUIRES: level_zero, ocloc +// +// DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=1 SYCL_CACHE_DIR=%t/cache_dir +// DEFINE: %{build_cmd} = %{build} +// RUN: mkdir -p %t/cache_dir +// +// The following block of code should be copy-pasted as-is to verify different +// JIT/AOT options. Don't know how to avoid code duplication. 
+// ****************************** +// Check the logs first. +// RUN: %{build_cmd} -DVALUE=1 -o %t.out +// RUN: rm -rf %t/cache_dir/* +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s %if windows %{ --check-prefixes=CHECK,CHECK-WIN %} +// +// Now try to substitute the cached image and verify it is actually taken and +// the code/binary there is executed. +// RUN: mv %t/cache_dir/*/*/*/*/*.bin %t.value1.bin +// RUN: rm -rf %t/cache_dir/* +// RUN: %{build_cmd} -DVALUE=2 -o %t.out +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT2 +// RUN: mv %t.value1.bin %t/cache_dir/*/*/*/*/*.bin +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT1 +// ****************************** +// +// REDEFINE: %{build_cmd} = %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device acm-g10" %s +// ****************************** +// Check the logs first. +// RUN: %{build_cmd} -DVALUE=1 -o %t.out +// RUN: rm -rf %t/cache_dir/* +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s %if windows %{ --check-prefixes=CHECK,CHECK-WIN %} +// +// Now try to substitute the cached image and verify it is actually taken and +// the code/binary there is executed. +// RUN: mv %t/cache_dir/*/*/*/*/*.bin %t.value1.bin +// RUN: rm -rf %t/cache_dir/* +// RUN: %{build_cmd} -DVALUE=2 -o %t.out +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT2 +// RUN: mv %t.value1.bin %t/cache_dir/*/*/*/*/*.bin +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT1 +// ****************************** +// +// REDEFINE: %{build_cmd} = %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device acm-g10,acm-g11" %s +// ****************************** +// Check the logs first. 
+// RUN: %{build_cmd} -DVALUE=1 -o %t.out +// RUN: rm -rf %t/cache_dir/* +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s %if windows %{ --check-prefixes=CHECK,CHECK-WIN %} +// +// Now try to substitute the cached image and verify it is actually taken and +// the code/binary there is executed. +// RUN: mv %t/cache_dir/*/*/*/*/*.bin %t.value1.bin +// RUN: rm -rf %t/cache_dir/* +// RUN: %{build_cmd} -DVALUE=2 -o %t.out +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT2 +// RUN: mv %t.value1.bin %t/cache_dir/*/*/*/*/*.bin +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT1 +// ****************************** + +// CHECK: Code caching: device binary has been cached: [[BIN_FILE:.*]] +// CHECK-WIN: Code caching: using cached device binary: [[BIN_FILE]] +// CHECK-WIN: Code caching: using cached device binary: [[BIN_FILE]] + +// RESULT1: Result (0): 1 +// RESULT1: Result (1): 1 +// RESULT1: Result (2): 1 + +// RESULT2: Result (0): 2 +// RESULT2: Result (1): 2 +// RESULT2: Result (2): 2 + +#include + +int main() { + for (int i = 0; i < 3; ++i) { + sycl::buffer b{1}; + sycl::queue{} + .submit([&](sycl::handler &cgh) { + sycl::accessor acc{b, cgh}; + cgh.single_task([=]() { acc[0] = VALUE; }); + }) + .wait(); + std::cout << "Result (" << i << "): " << sycl::host_accessor{b}[0] + << std::endl; + } + return 0; +} From 62c86d74024203c14e7c6d9c619dfd26c58bcc37 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 5 Feb 2024 09:52:49 -0800 Subject: [PATCH 2/8] [CI] Try to make OCL CPU task work on Intel GEN12 runner in Nightly (#12608) --- .github/workflows/sycl-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 0b464c75d33b6..3a741cdb221d7 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -57,7 +57,7 @@ jobs: 
- name: OCL CPU (Intel/GEN12) runner: '["Linux", "gen12"]' image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest - image_options: -u 1001 + image_options: -u 1001 --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu - name: OCL CPU (Intel/Arc) From d286f4ab1ca89fcc511a9070e4e74b180e9cfa12 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:09:46 -0800 Subject: [PATCH 3/8] [SYCL][ESIMD]Implement gather(lacc) accepting compile time properties (#12533) This implements the unified memory API for gather with local accessors --------- Co-authored-by: Vyacheslav Klochkov --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 317 +++++++++++++++++- .../unified_memory_api/Inputs/gather.hpp | 253 +++++++++++++- .../ESIMD/unified_memory_api/gather_lacc.cpp | 37 ++ .../gather_lacc_dg2_pvc.cpp | 40 +++ sycl/test/esimd/memory_properties.cpp | 78 ++++- 5 files changed, 717 insertions(+), 8 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 2ecc96a122317..d923821027339 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -3043,7 +3043,7 @@ gather(AccessorT acc, simd byte_offsets, /// simd gather(AccessorT acc, simd byte_offsets, /// simd_mask mask, /// PropertyListT props = {}); // (acc-ga-2) -/// Supported platforms: DG2, PVC in most cases. The DG2/PVC is not required if +/// Supported platforms: DG2, PVC in most cases. 
DG2/PVC is not required if /// VS == 1 and no L1/L2 cache hints used and sizeof(T) <= 4 and N = {1,8,16,32} /// /// Loads ("gathers") elements of the type 'T' from memory locations addressed @@ -3111,7 +3111,7 @@ gather(AccessorT acc, simd byte_offsets, /// typename PropertyListT = empty_properties_t> /// simd gather(AccessorT acc, simd byte_offsets, /// PropertyListT props = {}); // (acc-ga-3) -/// Supported platforms: DG2, PVC in most cases. The DG2/PVC is not required if +/// Supported platforms: DG2, PVC in most cases. DG2/PVC is not required if /// VS == 1 and no L1/L2 cache hints used and sizeof(T) <= 4 and N = {1,8,16,32} /// /// Loads ("gathers") elements of the type 'T' from memory locations addressed @@ -7389,6 +7389,317 @@ __ESIMD_API flags); } +/// Variant of gather that uses local accessor as a parameter +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-1) +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-2) +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-3) +/// +/// The next 3 functions are similar to (lacc-ga-1,2,3), but they don't have +/// the template parameter 'VS'. These functions are added for convenience and +/// to make it possible for the user to omit the template parameters T and N, +/// e.g. 
'auto res = gather(acc, byte_offsets); +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-4) +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, PropertyListT props = {});//(lacc-ga-5) +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-6) +/// +/// The next 3 functions are similar to (lacc-ga-1,2,3), but accept the +/// \p byte_offsets as a \c simd_view argument: +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-7) +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-8) +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-9) + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-1) +/// Supported platforms: DG2, PVC only - Temporary restriction for the variant +/// with pass_thru operand. The only exception: DG2/PVC is not required if +/// the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used. +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// Access to any element's memory location can be disabled via the input vector +/// of predicates \p mask. If mask[i] is unset, then the load from +/// (acc + byte_offsets[i]) is skipped and the corresponding i-th element from +/// \p pass_thru operand is returned. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. 
(VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param mask The access mask. +/// @param pass_thru The vector pass through values. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + simd_mask mask, simd pass_thru, PropertyListT props = {}) { + return slm_gather(byte_offsets + + __ESIMD_DNS::localAccessorToOffset(acc), + mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-2) +/// Supported platforms: DG2, PVC in most cases. DG2/PVC is not required if +/// VS == 1 and the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used or sizeof(T) <= +/// 4 and N = {1,2,4,8,16,32} +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// Access to any element's memory location can be disabled via the input vector +/// of predicates \p mask. If mask[i] is unset, then the load from +/// (acc + byte_offsets[i]) is skipped and the corresponding i-th element of +/// the returned vector is undefined. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. 
(VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + simd_mask mask, PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-3) +/// Supported platforms: DG2, PVC in most cases. DG2/PVC is not required if +/// VS == 1 and the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used or sizeof(T) <= +/// 4 and N = {1,2,4,8,16,32} +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. 
+/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-4) +/// This function is identical to (lacc-ga-1) except that vector size is fixed +/// to 1. This variant is added for convenience and lets the user omit the +/// template arguments and call the function as 'gather(acc, byte_offsets, mask, +/// pass_thru);'. +// Dev note: the mask type was turned into template parameter `MaskT` to +// avoid the conflicts of this prototype with the old gather() function +// accepting a 'global_offset' parameter and avoid 'ambiguous call' errors +// for calls like this: gather(acc, byte_offsets_simd, 0, mask); +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + std::is_same_v> && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, MaskT mask, + simd pass_thru, PropertyListT props = {}) { + return slm_gather(byte_offsets + + __ESIMD_DNS::localAccessorToOffset(acc), + mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, PropertyListT props // (lacc-ga-5) +/// This function is identical to (lacc-ga-2) except that vector size is fixed +/// to 1. 
This variant is added for convenience and let user omit the template +/// arguments and call the function as 'gather(acc, byte_offsets, mask);'. +// Dev note: the mask type was turned into template parameter `MaskT` to +// avoid the conflicts of this prototype with the old gather() function +// accepting a 'global_offset' parameter and avoid 'ambiguous call' errors +// for calls like this: gather(acc, byte_offsets_simd, 0); +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + std::is_same_v> && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, MaskT mask, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-6) +/// This function is identical to (lacc-ga-3) except that vector size is fixed +/// to 1. This variant is added for convenience and let user omit the template +/// arguments and call the function as 'gather(acc, byte_offsets);'. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-7) +/// This function is identical to (lacc-ga-1) except that the \p byte_offsets +/// is represented as \c simd_view. 
+template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask, + simd pass_thru, PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-8) +/// This function is identical to (lacc-ga-2) except that the \p byte_offsets +/// is represented as \c simd_view. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask, + PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), mask, props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-9) +/// This function is identical to (lacc-ga-3) except that the \p byte_offsets +/// is represented as \c simd_view. 
+template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), props); +} + /// Variant of gather that uses local accessor as a parameter /// /// Collects elements located at given offsets in an accessor and returns them @@ -7411,7 +7722,7 @@ __ESIMD_API std::enable_if_t, simd> - gather(AccessorTy acc, simd offsets, uint32_t glob_offset = 0, + gather(AccessorTy acc, simd offsets, uint32_t glob_offset, simd_mask mask = 1) { return slm_gather( offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp index 03ef9bc46c483..e34f259c093ec 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp @@ -508,7 +508,7 @@ bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) { uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - // Allocate a bit more to safely initialize it with 4-element chunks. + // Allocate a bit more to safely initialize it with 8-element chunks. constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); slm_init(); @@ -761,3 +761,254 @@ template bool testACC(queue Q) { } return Passed; } + +template +bool testLACC(queue Q, uint32_t MaskStride, PropertiesT) { + + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. 
N must be divisible by VS."); + constexpr int NOffsets = N / VS; + static_assert(!UsePassThru || UseMask, + "PassThru cannot be used without using mask"); + + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 16; + + std::cout << "Running case: T=" << esimd_test::type_name() << ", N=" << N + << ", VS=" << VS << ", MaskStride=" << MaskStride + << ", Groups=" << Groups << ", Threads=" << Threads + << ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru + << ", use_properties=" << UseProperties << std::endl; + + uint16_t Size = Groups * Threads * N; + using Tuint = esimd_test::uint_type_t; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = sycl::malloc_shared(Size, Q); + std::memset(Out, 0, Size * sizeof(T)); + + T *In = sycl::malloc_shared(Size * 2, Q); + for (int I = 0; I < Size; I++) + In[I] = esimd_test::getRandomValue(); + + try { + buffer InBuf(In, Size * 2); + Q.submit([&](handler &CGH) { + // Allocate a bit more to safely initialize it with 8-element chunks. + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + + auto InAcc = local_accessor(SLMSize, CGH); + + CGH.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL { + uint16_t GlobalID = NDI.get_global_id(0); + uint16_t LocalID = NDI.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(In + GlobalElemOffset + I); + simd Offsets(I * sizeof(T), sizeof(T)); + scatter(InAcc, Offsets, InVec); + } + } + barrier(); + PropertiesT Props{}; + + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); + simd_view ByteOffsetsView = ByteOffsets.template select(); + + simd_mask Pred; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 
1 : 0; + + using Tuint = esimd_test::uint_type_t; + simd PassThruInt(GlobalElemOffset, 1); + simd PassThru = PassThruInt.template bit_cast_view(); + auto PassThruView = PassThru.template select(0); + + simd Vals; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UsePassThru) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru, + Props); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView, + Props); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, + Props); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, + PassThruView, Props); + } else { // UseProperties is false + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = + gather(InAcc, ByteOffsets, Pred, PassThruView); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = + gather(InAcc, ByteOffsetsView, Pred, PassThru); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, + PassThruView); + } + } else if constexpr (UseMask) { // UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred); + } + } else { // UseMask is false, UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, 
ByteOffsets, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView); + } + } + } else { + // if (VS == 1) then can often be omitted - test it here. + // The variants accepting simd_view for 'PassThru' operand though + // still require to be specified explicitly to help + // C++ FE do simd to simd_view matching. + if constexpr (UsePassThru) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru, Props); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView, + Props); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, Props); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThruView, + Props); + } else { // UseProperties is false + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru); + else // ByteOffset - view, PassThru - view + Vals = + gather(InAcc, ByteOffsetsView, Pred, PassThruView); + } + } else if constexpr (UseMask) { // UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = 
gather(InAcc, ByteOffsets, Pred); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred); + } + } else { // UsePassThru is false, UseMask is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Props); + } else { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView); + } + } + } // end if (VS == 1) + Vals.copy_to(Out + GlobalID * N); + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(In, Q); + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru); + if (!Passed) + std::cout << "Case FAILED" << std::endl; + + sycl::free(In, Q); + sycl::free(Out, Q); + return Passed; +} + +template bool testLACC(queue Q) { + constexpr bool UseMask = true; + constexpr bool UsePassThru = true; + constexpr bool UseProperties = true; + + properties AlignElemProps{alignment}; + + bool Passed = true; + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testLACC( + Q, 2, LSCProps); + Passed &= + testLACC(Q, 2, LSCProps); + Passed &= + testLACC(Q, 2, LSCProps); + Passed &= + testLACC(Q, 3, LSCProps); + + Passed &= + testLACC(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. 
Enable it after the issue + // in GPU driver is fixed. + // Passed &= testACC( + // Q, 3, AlignElemProps); + + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + } + } + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp new file mode 100644 index 0000000000000..329c3aa977286 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp @@ -0,0 +1,37 @@ +//==------- gather_lacc.cpp - DPC++ ESIMD on-device test ------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES-INTEL-DRIVER: lin: 26690, win: 101.4576 +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::gather() functions accepting Local ACCESSOR +// and optional compile-time esimd::properties. +// The gather() calls in this test do not use VS > 1 (number of loads per +// offset) to not impose using DG2/PVC features. + +#include "Inputs/gather.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= + testLACC(Q); + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 
0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp new file mode 100644 index 0000000000000..ce13ee56f367e --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp @@ -0,0 +1,40 @@ +//==------- gather_lacc_dg2_pvc.cpp - DPC++ ESIMD on-device test ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc +// REQUIRES-INTEL-DRIVER: lin: 26690, win: 101.4576 + +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::gather() functions accepting Local ACCESSOR +// and optional compile-time esimd::properties. +// The gather() calls in this test can use VS > 1 (number of loads per offset). + +#include "Inputs/gather.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::DG2; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= + testLACC(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 
0 : 1; +} diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 31dbc3e889f4b..3f653adb5e476 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -31,7 +31,7 @@ test_block_store(AccType &, LocalAccType &local_acc, float *, int byte_offset32, size_t byte_offset64); SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void -test_gather_scatter(AccType &, float *, int byte_offset32, +test_gather_scatter(AccType &, LocalAccType &, float *, int byte_offset32, size_t byte_offset64); SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_slm_gather_scatter(int byte_offset32); @@ -48,7 +48,7 @@ class EsimdFunctor { test_block_load(acc, local_acc, ptr, byte_offset32, byte_offset64); test_atomic_update(acc, local_acc_int, ptr, byte_offset32, byte_offset64); test_block_store(acc, local_acc, ptr, byte_offset32, byte_offset64); - test_gather_scatter(acc, ptr, byte_offset32, byte_offset64); + test_gather_scatter(acc, local_acc, ptr, byte_offset32, byte_offset64); test_slm_gather_scatter(byte_offset32); } }; @@ -939,8 +939,8 @@ test_block_store(AccType &acc, LocalAccType &local_acc, float *ptrf, // CHECK-LABEL: define {{.*}} @_Z19test_gather_scatter{{.*}} SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void -test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, - size_t byte_offset64) { +test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, + int byte_offset32, size_t byte_offset64) { properties props_cache_load{cache_hint_L1, cache_hint_L2, alignment<8>}; @@ -980,6 +980,10 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, // 6) gather(acc, offsets, mask): offsets is simd or simd_view // 7) gather(acc, offsets, mask, pass_thru) // 8) gather(acc, ...): same as (5), (6), (7) above, but with VS > 1. 
+ // 9) gather(lacc, offsets): offsets is simd or simd_view + // 10) gather(lacc, offsets, mask): offsets is simd or simd_view + // 11) gather(lacc, offsets, mask, pass_thru) + // 12) gather(lacc, ...): same as (9), (10), (11) above, but with VS > 1. // 1) gather(usm, offsets): offsets is simd or simd_view // CHECK-COUNT-4: call <32 x float> @llvm.masked.gather.v32f32.v32p4(<32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) @@ -1154,6 +1158,72 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, acc_res = gather(acc, ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + // 9) gather(lacc, offsets): offsets is simd or simd_view + // CHECK-COUNT-16: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n32); + acc_res = gather(local_acc, ioffset_n32_view); + acc_res = gather(local_acc, ioffset_n32, props_align4); + acc_res = gather(local_acc, ioffset_n32_view, props_align4); + + // 10) gather(lacc, offsets, mask): offsets is simd or simd_view + acc_res = gather(local_acc, ioffset_n32, mask_n32); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32); + acc_res = gather(local_acc, ioffset_n32, mask_n32, props_align4); + acc_res = + gather(local_acc, ioffset_n32_view, mask_n32, props_align4); + + // 11) gather(lacc, offsets, mask, pass_thru) + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru); + acc_res = + gather(local_acc, ioffset_n32, mask_n32, pass_thru, props_align4); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru, + props_align4); + + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru_view); + acc_res = + gather(local_acc, ioffset_n32_view, mask_n32, pass_thru_view); + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru_view, + props_align4); + acc_res = 
gather(local_acc, ioffset_n32_view, mask_n32, + pass_thru_view, props_align4); + + // 12) gather(lacc, ...): same as (9), (10), (11) above, but with VS > 1. + // CHECK-COUNT-16: call <32 x i32> @llvm.genx.lsc.load.merge.slm.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n16); + acc_res = gather(local_acc, ioffset_n16_view); + acc_res = gather(local_acc, ioffset_n16, props_align4); + acc_res = gather(local_acc, ioffset_n16_view, props_align4); + + acc_res = gather(local_acc, ioffset_n16, mask_n16); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16); + acc_res = + gather(local_acc, ioffset_n16, mask_n16, props_align4); + acc_res = + gather(local_acc, ioffset_n16_view, mask_n16, props_align4); + + acc_res = gather(local_acc, ioffset_n16, mask_n16, pass_thru); + acc_res = + gather(local_acc, ioffset_n16_view, mask_n16, pass_thru); + acc_res = gather(local_acc, ioffset_n16, mask_n16, pass_thru, + props_align4); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru, props_align4); + + acc_res = + gather(local_acc, ioffset_n16, mask_n16, pass_thru_view); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru_view); + acc_res = gather(local_acc, ioffset_n16, mask_n16, + pass_thru_view, props_align4); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru_view, props_align4); + + // Validate that a new API doesn't conflict with the old API. 
+ // CHECK-COUNT-2: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n32, 0); + acc_res = gather(local_acc, ioffset_n32, 0, mask_n32); + // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) scatter(ptrf, ioffset_n32, usm, mask_n32); From e4113f1cc3efcfa03549566b19b093ff0b388e40 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:12:31 -0800 Subject: [PATCH 4/8] [SYCL][ESIMD]Fix lsc_slm_block_load test (#12484) --- sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp | 6 +++--- sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp index 3edd3f5c4557f..50881390a6a7b 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp @@ -120,11 +120,11 @@ bool test(queue Q, uint32_t PMask = ~0) { uint32_t LID = I % (LocalRange * VL * NChannels); uint32_t GID = I / VL; bool Pred = (GID & 0x1) == 0; - T ExpectedVal = GroupId * 1000000 + LID; + Tuint ExpectedVal = GroupId * 1000000 + LID; if (TestMergeOperand && !Pred) - ExpectedVal = GID + (I % VL); + ExpectedVal = sycl::bit_cast((T)(GID + (I % VL))); - if (Out[I] != ExpectedVal && NErrors++ < 32) { + if (sycl::bit_cast(Out[I]) != ExpectedVal && NErrors++ < 32) { std::cout << "Error: " << I << ": Value = " << Out[I] << ", Expected value = " << ExpectedVal << std::endl; } diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp index a340c90b2a66b..635b4db519e30 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp @@ -18,6 +18,13 @@ 
template bool test_load(queue Q) { Passed &= test(Q); Passed &= test(Q); Passed &= test(Q); + + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); return Passed; } From a2e166941fa5128fce54b50b60c53a718c9c5845 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 5 Feb 2024 11:15:54 -0800 Subject: [PATCH 5/8] [SYCL] Fix bitselect builtin for integer types (#12598) This regressed after #11956 as return type wasn't correctly converted from SPIR-V intrinsic back to SYCL types. This PR fixes that. In addition, I'm also adding tests for `sycl::select` builtin that was left unaffected only because we couldn't use SPIR-V intrinsic for its implementation. --- .../detail/builtins/relational_functions.inc | 8 ++- .../Basic/built-ins/marray_relational.cpp | 58 +++++++++++++++++ .../Basic/built-ins/scalar_relational.cpp | 59 +++++++++++++++++ .../Basic/built-ins/vec_relational.cpp | 64 +++++++++++++++++++ 4 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 sycl/test-e2e/Basic/built-ins/scalar_relational.cpp diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc index 64a7d67934da1..d23f144676cdb 100644 --- a/sycl/include/sycl/detail/builtins/relational_functions.inc +++ b/sycl/include/sycl/detail/builtins/relational_functions.inc @@ -118,8 +118,12 @@ BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet) #undef BUILTIN_REL #ifdef __SYCL_DEVICE_ONLY__ -DEVICE_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, - __spirv_ocl_bitselect) +DEVICE_IMPL_TEMPLATE( + THREE_ARGS, bitselect, builtin_enable_generic_t, [](auto... 
xs) { + using ret_ty = detail::builtin_enable_generic_t; + using detail::builtins::convert_result; + return convert_result(__spirv_ocl_bitselect(xs...)); + }) #else HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel, default_ret_type) diff --git a/sycl/test-e2e/Basic/built-ins/marray_relational.cpp b/sycl/test-e2e/Basic/built-ins/marray_relational.cpp index 3b2f7ccaf990d..36021efa81475 100644 --- a/sycl/test-e2e/Basic/built-ins/marray_relational.cpp +++ b/sycl/test-e2e/Basic/built-ins/marray_relational.cpp @@ -111,6 +111,64 @@ int main() { TEST2(sycl::any, int, EXPECTED(bool, false), 3, ma7); TEST(sycl::bitselect, float, EXPECTED(float, 1.0, 1.0), 2, ma8, ma9, ma10); TEST(sycl::select, float, EXPECTED(float, 1.0, 2.0, 8.0), 3, ma5, ma6, c); + { + // Extra tests for select/bitselect due to special handling required for + // integer return types. + + auto Test = [&](auto F, auto Expected, auto... Args) { + std::tuple ArgsTuple{Args...}; + auto Result = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + + auto Equal = [](auto x, auto y) { + for (size_t i = 0; i < x.size(); ++i) + if (x[i] != y[i]) + return false; + + return true; + }; + + assert(Equal(Result, Expected)); + + sycl::buffer ResultBuf{1}; + deviceQueue.submit([&](sycl::handler &cgh) { + sycl::accessor Result{ResultBuf, cgh}; + cgh.single_task([=]() { + auto R = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + Result[0] = Equal(R, Expected); + }); + }); + assert(sycl::host_accessor{ResultBuf}[0]); + }; + + sycl::marray a{0b1100, 0b0011}; + sycl::marray b{0b0011, 0b1100}; + sycl::marray c{0b1010, 0b1010}; + sycl::marray r{0b0110, 0b1001}; + + auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); }; + Test(BitSelect, r, a, b, c); + // Input values/results above are positive, so use the same values for + // signed/unsigned char tests. + [&](auto... xs) { + Test(BitSelect, sycl::marray{xs}...); + }(r, a, b, c); + [&](auto... 
xs) { + Test(BitSelect, sycl::marray{xs}...); + }(r, a, b, c); + + auto Select = [](auto... xs) { return sycl::select(xs...); }; + sycl::marray c2{false, true}; + sycl::marray r2{a[0], b[1]}; + Test(Select, r2, a, b, c2); + [&](auto... xs) { + Test(Select, sycl::marray{xs}..., c2); + }(r2, a, b); + [&](auto... xs) { + Test(Select, sycl::marray{xs}..., c2); + }(r2, a, b); + } return 0; } diff --git a/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp b/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp new file mode 100644 index 0000000000000..4c35dd9af6af4 --- /dev/null +++ b/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp @@ -0,0 +1,59 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t_preview.out %} +// RUN: %if preview-breaking-changes-supported %{ %{run} %t_preview.out%} + +#include + +template void TestTypes(FuncTy F) { + (F(Ts{}), ...); +} + +int main() { + sycl::queue q; + + auto Test = [&](auto F, auto Expected, auto... Args) { +#if defined(__GNUC__) || defined(__clang__) + std::cout << __PRETTY_FUNCTION__ << std::endl; +#endif + std::tuple ArgsTuple{Args...}; + auto Result = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + assert(Expected == Result); + + sycl::buffer ResultBuf{1}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor Result{ResultBuf, cgh}; + cgh.single_task([=]() { + auto R = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + Result[0] = Expected == R; + }); + }); + assert(sycl::host_accessor{ResultBuf}[0]); + }; + + auto TestBitSelect = [&](auto type_val) { + using T = decltype(type_val); + auto BitSelect = [](auto... 
xs) { return sycl::bitselect(xs...); }; + + static_assert(std::is_integral_v, + "Only integer test is implemented here!"); + Test(BitSelect, T{0b0110}, T{0b1100}, T{0b0011}, T{0b1010}); + }; + + TestTypes(TestBitSelect); + + auto TestSelect = [&](auto type_val) { + using T = decltype(type_val); + auto Select = [](auto... xs) { return sycl::select(xs...); }; + + Test(Select, T{0}, T{1}, T{0}, true); + Test(Select, T{1}, T{1}, T{0}, false); + }; + + TestTypes(TestSelect); + + return 0; +} diff --git a/sycl/test-e2e/Basic/built-ins/vec_relational.cpp b/sycl/test-e2e/Basic/built-ins/vec_relational.cpp index 91ffcc354a275..45f12f22952fa 100644 --- a/sycl/test-e2e/Basic/built-ins/vec_relational.cpp +++ b/sycl/test-e2e/Basic/built-ins/vec_relational.cpp @@ -88,6 +88,70 @@ int main() { TEST2(sycl::any, int, EXPECTED(int32_t, 0), 3, va7); TEST(sycl::bitselect, float, EXPECTED(float, 1.0, 1.0), 2, va8, va9, va10); TEST(sycl::select, float, EXPECTED(float, 1.0, 2.0, 8.0), 3, va5, va6, c1); + { + // Extra tests for select/bitselect due to special handling required for + // integer return types. + + auto Test = [&](auto F, auto Expected, auto... Args) { + std::tuple ArgsTuple{Args...}; + auto Result = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + + // Note: operator==(vec, vec) return vec. + auto Equal = [](auto x, auto y) { + for (size_t i = 0; i < x.size(); ++i) + if (x[i] != y[i]) + return false; + + return true; + }; + + assert(Equal(Result, Expected)); + + sycl::buffer ResultBuf{1}; + deviceQueue.submit([&](sycl::handler &cgh) { + sycl::accessor Result{ResultBuf, cgh}; + cgh.single_task([=]() { + auto R = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + Result[0] = Equal(R, Expected); + }); + }); + assert(sycl::host_accessor{ResultBuf}[0]); + }; + + // Note that only int8_t/uint8_t are supported by the bitselect/select + // builtins and not all three char data types. 
Also, use positive numbers + // for the values below so that we could use the same for both + // signed/unsigned tests. + sycl::vec a{0b1100, 0b0011}; + sycl::vec b{0b0011, 0b1100}; + sycl::vec c{0b1010, 0b1010}; + sycl::vec r{0b0110, 0b1001}; + + auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); }; + Test(BitSelect, r, a, b, c); + [&](auto... xs) { + Test(BitSelect, xs.template as>()...); + }(r, a, b, c); + + auto Select = [](auto... xs) { return sycl::select(xs...); }; + sycl::vec c2{0x7F, 0xFF}; + sycl::vec r2{a[0], b[1]}; + + Test(Select, r2, a, b, c2); + [&](auto... xs) { + Test(Select, xs.template as>()..., c2); + }(r2, a, b); + + // Assume that MSB of a signed data type is the leftmost bit (signbit). + auto c3 = c2.template as>(); + + Test(Select, r2, a, b, c3); + [&](auto... xs) { + Test(Select, xs.template as>()..., c3); + }(r2, a, b); + } #ifdef __INTEL_PREVIEW_BREAKING_CHANGES TEST(sycl::isequal, int32_t, EXPECTED(int32_t, 1, 1), 2, va11.swizzle<0, 1>(), From 20aee784b2bbf08b5d48f5cdca9833a7c40ec23e Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 5 Feb 2024 11:16:15 -0800 Subject: [PATCH 6/8] [SYCL][E2E] XPASS->UNSUPPORTED for KernelFusion/Reduction/reduction.cpp (#12607) See notes in https://github.com/intel/llvm/pull/12540#issuecomment-1927506081 https://github.com/intel/llvm/pull/12586#issuecomment-1927494475 --- sycl/test-e2e/KernelFusion/Reduction/reduction.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp index 3e2a3932224ed..120679246c64c 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp @@ -1,6 +1,9 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu +// +// The test fails on opencl:cpu when running on AMD runner and passes when +// running on Intel Arc GPU runner. 
+// UNSUPPORTED: cpu // Test fusion works with reductions. From e9a1ace965cbdf6e27e4cd468a1d157c2db640b4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Mon, 5 Feb 2024 17:46:49 -0600 Subject: [PATCH 7/8] [ESIMD] Optimize the simd stride constructor (#12553) simd(base, stride) calls previously were lowered into a long sequence of INSERT and ADD operations. That sequence is replaced with a vector equivalent: vbase = broadcast base vstride = broadcast stride vstride_coef = {0, 1, 2, 3, ... N-1} vec_result = vbase + vstride * vstride_coef; --------- Signed-off-by: Klochkov, Vyacheslav N --- .../ext/intel/esimd/detail/simd_obj_impl.hpp | 37 +++++++----- .../ESIMD/api/functional/ctors/ctor_fill.hpp | 16 +---- .../ctors/ctor_fill_accuracy_fp.cpp | 3 +- .../api/functional/ctors/ctor_fill_core.cpp | 14 ++++- sycl/test/esimd/ctor_codegen.cpp | 58 ++++++++++++++++--- 5 files changed, 87 insertions(+), 41 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp index ae6137c01fe12..5efd958789f72 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp @@ -124,13 +124,23 @@ constexpr vector_type_t make_vector(const T (&&Arr)[N]) { } template -constexpr vector_type_t make_vector_impl(T Base, T Stride, - std::index_sequence) { - return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; +constexpr auto make_vector_impl(T Base, T Stride, std::index_sequence) { + if constexpr (std::is_integral_v && N <= 3) { + // This sequence is a bit more efficient for integral types and N <= 3. 
+ return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; + } else { + using CppT = typename element_type_traits::EnclosingCppT; + CppT BaseCpp = Base; + CppT StrideCpp = Stride; + vector_type_t VBase = BaseCpp; + vector_type_t VStride = StrideCpp; + vector_type_t VStrideCoef{(CppT)(Is)...}; + vector_type_t Result{VBase + VStride * VStrideCoef}; + return wrapper_type_converter::template to_vector(Result); + } } -template -constexpr vector_type_t make_vector(T Base, T Stride) { +template constexpr auto make_vector(T Base, T Stride) { return make_vector_impl(Base, Stride, std::make_index_sequence{}); } @@ -265,18 +275,13 @@ class [[__sycl_detail__::__uses_aspects__( /// are initialized with the arithmetic progression defined by the arguments. /// For example, simd x(1, 3) will initialize x to the /// {1, 4, 7, 10} sequence. - /// @param Val The start of the progression. + /// If Ty is a floating-point type and \p Base or \p Step is +/-inf or nan, + /// then this constructor has undefined behavior. + /// @param Base The start of the progression. /// @param Step The step of the progression. - simd_obj_impl(Ty Val, Ty Step) noexcept { - __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); - if constexpr (is_wrapper_elem_type_v || !std::is_integral_v) { - for (int i = 0; i < N; ++i) { - M_data[i] = bitcast_to_raw_type(Val); - Val = binary_op(Val, Step); - } - } else { - M_data = make_vector(Val, Step); - } + simd_obj_impl(Ty Base, Ty Step) noexcept { + __esimd_dbg_print(simd_obj_impl(Ty Base, Ty Step)); + M_data = make_vector(Base, Step); } /// Broadcast constructor. 
Given value is type-converted to the diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp index 8ae03e7a9a577..b3c81ec144530 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp @@ -246,18 +246,8 @@ class run_test { }); queue.wait_and_throw(); - // Verify the base value was passed as-is - if (!are_bitwise_equal(result[0], base_value)) { - passed = false; - log::fail(TestDescriptionT(data_type, BaseVal, Step), - "Unexpected value at index 0, retrieved: ", result[0], - ", expected: ", base_value); - } - - // Verify the step value works as expected being passed to the fill - // constructor. - DataT expected_value = base_value; - for (size_t i = 1; i < result.size(); ++i) { + // Verify the fill constructor. + for (size_t i = 0; i < result.size(); ++i) { if constexpr (BaseVal == init_val::nan || Step == init_val::nan) { if (!std::isnan(result[i])) { @@ -268,7 +258,7 @@ class run_test { } } else { - expected_value += step_value; + DataT expected_value = base_value + (DataT)i * step_value; if (!are_bitwise_equal(result[i], expected_value)) { passed = false; log::fail(TestDescriptionT(data_type, BaseVal, Step), diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp index 9fcac2c603c30..9d34849e106f8 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp @@ -15,8 +15,7 @@ // The test verifies that simd fill constructor has no precision differences. // The test do the following actions: // - call simd with predefined base and step values -// - bitwise comparing that output[0] value is equal to base value and -// output[i] is equal to output[i -1] + step_value +// - bitwise comparing that output[i] is equal to base + i * step_value.
#include "ctor_fill.hpp" diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp index 7d2fc4f7592d8..f8417718a0962 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp @@ -112,10 +112,22 @@ int main(int, char **) { } { const auto types = get_tested_types(); + { + const auto base_values = + ctors::get_init_values_pack(); + const auto step_values = + ctors::get_init_values_pack(); + passed &= for_all_combinations( + types, sizes, contexts, base_values, step_values, queue); + } + // The test cases below have never been guaranteed to work some certain + // way with base and step values set to inf or nan. They may or may not + // work as expected by the checks in this test. { const auto base_values = ctors::get_init_values_pack(); - const auto step_values = ctors::get_init_values_pack(); + const auto step_values = + ctors::get_init_values_pack(); passed &= for_all_combinations( types, sizes, contexts, base_values, step_values, queue); } diff --git a/sycl/test/esimd/ctor_codegen.cpp b/sycl/test/esimd/ctor_codegen.cpp index cf86da2a5aad5..9e75e76fdf972 100644 --- a/sycl/test/esimd/ctor_codegen.cpp +++ b/sycl/test/esimd/ctor_codegen.cpp @@ -24,22 +24,62 @@ SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION { // CHECK-NEXT: } } -// Base + step constructor, FP element type, loops exected - don't check. -SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION { - simd val(17, 3); - return val; +// Const base + step constructor, FP element type. +SYCL_EXTERNAL auto double_base_step_const() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z22double_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{1.0, 3.0}; + // CHECK: store <64 x double> , ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + +// Variable base + step constructor, FP element type.
+SYCL_EXTERNAL auto double_base_step_var(double base, double step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z20double_base_step_vardd({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], double noundef %[[BASE:[a-zA-Z0-9_\.]+]], double noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[BASE]], i64 0 + // CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[BASE_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer + // CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[STEP]], i64 0 + // CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[STEP_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer + // CHECK: %[[FMA_VEC:[a-zA-Z0-9_\.]+]] = tail call noundef <32 x double> @llvm.fmuladd.v32f64(<32 x double> %[[STEP_VEC]], <32 x double> , <32 x double> %[[BASE_VEC]]) + // CHECK: store <32 x double> %[[FMA_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void } -// Base + step constructor, integer element type, no loops exected - check. -SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION { - // CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { - simd val(17, 3); +// Const base + step constructor, integer element type. +SYCL_EXTERNAL auto int_base_step_const() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z19int_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(17, 3); return val; - // CHECK: store <2 x i32> , ptr addrspace(4) %[[RES]] + // CHECK: store <16 x i32> , ptr addrspace(4) %[[RES]] // CHECK-NEXT: ret void // CHECK-NEXT: } } +// Variable base + step constructor, integer element type. 
+SYCL_EXTERNAL auto int_base_step_var(int base, int step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z17int_base_step_varii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[BASE]], i64 0 + // CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[BASE_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer + // CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[STEP]], i64 0 + // CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[STEP_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer + // CHECK: %[[MUL_VEC:[a-zA-Z0-9_\.]+]] = mul <32 x i32> %[[STEP_VEC]], + // CHECK: %[[ADD_VEC:[a-zA-Z0-9_\.]+]] = add <32 x i32> %[[BASE_VEC]], %[[MUL_VEC]] + // CHECK: store <32 x i32> %[[ADD_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + +// Variable base + step constructor, integer element type. +SYCL_EXTERNAL auto int_base_step_var_n2(int base, int step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z20int_base_step_var_n2ii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP1:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> poison, i32 %[[BASE]], i64 0 + // CHECK: %[[BASE_INC:[a-zA-Z0-9_\.]+]] = add nsw i32 %[[BASE]], %[[STEP]] + // CHECK: %[[RESULT_VEC:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> %[[BASE_VEC_TMP1]], i32 %[[BASE_INC]], i64 1 + // CHECK: store <2 x i32> %[[RESULT_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + // Broadcast constructor, FP element type, no loops exected - check. 
SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION { // CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { From bd86f04d1ce18ece6f5187635597ea977fc93d8f Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:51:49 -0800 Subject: [PATCH 8/8] [SYCL][ESIMD] Remove no-fast-math option for tests (#12534) The tests are passing with the latest xmain/driver --- sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp | 5 +---- .../ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp index f8417718a0962..73f35354d322f 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp @@ -5,10 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// TODO: remove fno-fast-math option once the issue is investigated and the test -// is fixed. -// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} -// RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // // Test for simd fill constructor for core types. 
diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp index c6056458f5f00..525f9d6176f71 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp @@ -5,10 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// TODO: remove fno-fast-math option once the issue is investigated and the test -// is fixed. -// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} -// RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // // Test for simd fill constructor for extra fp types.