Merge remote-tracking branch 'upstream/sycl' into georgi/sycl-cuda-ou…

…t-of-resources-registers-error
intel · Feb 6, 2024 · 7e258e9 · 7e258e9
2 parents 4d6eb99 + bd86f04
commit 7e258e9
Show file tree

Hide file tree

Showing 21 changed files with 1,105 additions and 64 deletions.
diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml
@@ -57,7 +57,7 @@ jobs:
           - name: OCL CPU (Intel/GEN12)
             runner: '["Linux", "gen12"]'
             image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
-            image_options: -u 1001
+            image_options: -u 1001 --privileged --cap-add SYS_ADMIN
             target_devices: opencl:cpu
 
           - name: OCL CPU (Intel/Arc)

diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc
@@ -118,8 +118,12 @@ BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet)
 #undef BUILTIN_REL
 
 #ifdef __SYCL_DEVICE_ONLY__
-DEVICE_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t,
-                     __spirv_ocl_bitselect)
+DEVICE_IMPL_TEMPLATE(
+    THREE_ARGS, bitselect, builtin_enable_generic_t, [](auto... xs) {
+      using ret_ty = detail::builtin_enable_generic_t<THREE_ARGS_TEMPLATE_TYPE>;
+      using detail::builtins::convert_result;
+      return convert_result<ret_ty>(__spirv_ocl_bitselect(xs...));
+    })
 #else
 HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel,
                    default_ret_type)

diff --git a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp
@@ -124,13 +124,23 @@ constexpr vector_type_t<T, N> make_vector(const T (&&Arr)[N]) {
 }
 
 template <class T, int N, size_t... Is>
-constexpr vector_type_t<T, N> make_vector_impl(T Base, T Stride,
-                                               std::index_sequence<Is...>) {
-  return vector_type_t<T, N>{(T)(Base + ((T)Is) * Stride)...};
+constexpr auto make_vector_impl(T Base, T Stride, std::index_sequence<Is...>) {
+  if constexpr (std::is_integral_v<T> && N <= 3) {
+    // This sequence is a bit more efficient for integral types and N <= 3.
+    return vector_type_t<T, N>{(T)(Base + ((T)Is) * Stride)...};
+  } else {
+    using CppT = typename element_type_traits<T>::EnclosingCppT;
+    CppT BaseCpp = Base;
+    CppT StrideCpp = Stride;
+    vector_type_t<CppT, N> VBase = BaseCpp;
+    vector_type_t<CppT, N> VStride = StrideCpp;
+    vector_type_t<CppT, N> VStrideCoef{(CppT)(Is)...};
+    vector_type_t<CppT, N> Result{VBase + VStride * VStrideCoef};
+    return wrapper_type_converter<T>::template to_vector<N>(Result);
+  }
 }
 
-template <class T, int N>
-constexpr vector_type_t<T, N> make_vector(T Base, T Stride) {
+template <class T, int N> constexpr auto make_vector(T Base, T Stride) {
   return make_vector_impl<T, N>(Base, Stride, std::make_index_sequence<N>{});
 }
 
@@ -265,18 +275,13 @@ class [[__sycl_detail__::__uses_aspects__(
   /// are initialized with the arithmetic progression defined by the arguments.
   /// For example, <code>simd<int, 4> x(1, 3)</code> will initialize x to the
   /// <code>{1, 4, 7, 10}</code> sequence.
-  /// @param Val The start of the progression.
+  /// If Ty is a floating-point type and \p Base or \p Step is +/-inf or nan,
+  /// then this constructor has undefined behavior.
+  /// @param Base The start of the progression.
   /// @param Step The step of the progression.
-  simd_obj_impl(Ty Val, Ty Step) noexcept {
-    __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step));
-    if constexpr (is_wrapper_elem_type_v<Ty> || !std::is_integral_v<Ty>) {
-      for (int i = 0; i < N; ++i) {
-        M_data[i] = bitcast_to_raw_type(Val);
-        Val = binary_op<BinOp::add, Ty>(Val, Step);
-      }
-    } else {
-      M_data = make_vector<Ty, N>(Val, Step);
-    }
+  simd_obj_impl(Ty Base, Ty Step) noexcept {
+    __esimd_dbg_print(simd_obj_impl(Ty Base, Ty Step));
+    M_data = make_vector<Ty, N>(Base, Step);
   }
 
   /// Broadcast constructor. Given value is type-converted to the

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp
@@ -718,6 +718,14 @@ getBinaryImageFormat(const unsigned char *ImgData, size_t ImgSize) {
       return PI_DEVICE_BINARY_TYPE_NATIVE;
   }
 
+  if (MatchMagicNumber(std::array{'!', '<', 'a', 'r', 'c', 'h', '>', '\n'}))
+    // "ar" format is used to pack binaries for multiple devices, e.g. via
+    //
+    //   -Xsycl-target-backend=spir64_gen "-device acm-g10,acm-g11"
+    //
+    // option.
+    return PI_DEVICE_BINARY_TYPE_NATIVE;
+
   return PI_DEVICE_BINARY_TYPE_NONE;
 }
 

diff --git a/sycl/test-e2e/Basic/built-ins/marray_relational.cpp b/sycl/test-e2e/Basic/built-ins/marray_relational.cpp
@@ -111,6 +111,64 @@ int main() {
   TEST2(sycl::any, int, EXPECTED(bool, false), 3, ma7);
   TEST(sycl::bitselect, float, EXPECTED(float, 1.0, 1.0), 2, ma8, ma9, ma10);
   TEST(sycl::select, float, EXPECTED(float, 1.0, 2.0, 8.0), 3, ma5, ma6, c);
+  {
+    // Extra tests for select/bitselect due to special handling required for
+    // integer return types.
+
+    auto Test = [&](auto F, auto Expected, auto... Args) {
+      std::tuple ArgsTuple{Args...};
+      auto Result = std::apply(F, ArgsTuple);
+      static_assert(std::is_same_v<decltype(Expected), decltype(Result)>);
+
+      auto Equal = [](auto x, auto y) {
+        for (size_t i = 0; i < x.size(); ++i)
+          if (x[i] != y[i])
+            return false;
+
+        return true;
+      };
+
+      assert(Equal(Result, Expected));
+
+      sycl::buffer<bool, 1> ResultBuf{1};
+      deviceQueue.submit([&](sycl::handler &cgh) {
+        sycl::accessor Result{ResultBuf, cgh};
+        cgh.single_task([=]() {
+          auto R = std::apply(F, ArgsTuple);
+          static_assert(std::is_same_v<decltype(Expected), decltype(R)>);
+          Result[0] = Equal(R, Expected);
+        });
+      });
+      assert(sycl::host_accessor{ResultBuf}[0]);
+    };
+
+    sycl::marray<char, 2> a{0b1100, 0b0011};
+    sycl::marray<char, 2> b{0b0011, 0b1100};
+    sycl::marray<char, 2> c{0b1010, 0b1010};
+    sycl::marray<char, 2> r{0b0110, 0b1001};
+
+    auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); };
+    Test(BitSelect, r, a, b, c);
+    // Input values/results above are positive, so use the same values for
+    // signed/unsigned char tests.
+    [&](auto... xs) {
+      Test(BitSelect, sycl::marray<signed char, 2>{xs}...);
+    }(r, a, b, c);
+    [&](auto... xs) {
+      Test(BitSelect, sycl::marray<unsigned char, 2>{xs}...);
+    }(r, a, b, c);
+
+    auto Select = [](auto... xs) { return sycl::select(xs...); };
+    sycl::marray<bool, 2> c2{false, true};
+    sycl::marray<char, 2> r2{a[0], b[1]};
+    Test(Select, r2, a, b, c2);
+    [&](auto... xs) {
+      Test(Select, sycl::marray<signed char, 2>{xs}..., c2);
+    }(r2, a, b);
+    [&](auto... xs) {
+      Test(Select, sycl::marray<unsigned char, 2>{xs}..., c2);
+    }(r2, a, b);
+  }
 
   return 0;
 }
diff --git a/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp b/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp
@@ -0,0 +1,59 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t_preview.out %}
+// RUN: %if preview-breaking-changes-supported %{ %{run} %t_preview.out%}
+
+#include <sycl/sycl.hpp>
+
+template <typename... Ts, typename FuncTy> void TestTypes(FuncTy F) {
+  (F(Ts{}), ...);
+}
+
+int main() {
+  sycl::queue q;
+
+  auto Test = [&](auto F, auto Expected, auto... Args) {
+#if defined(__GNUC__) || defined(__clang__)
+    std::cout << __PRETTY_FUNCTION__ << std::endl;
+#endif
+    std::tuple ArgsTuple{Args...};
+    auto Result = std::apply(F, ArgsTuple);
+    static_assert(std::is_same_v<decltype(Expected), decltype(Result)>);
+    assert(Expected == Result);
+
+    sycl::buffer<bool, 1> ResultBuf{1};
+    q.submit([&](sycl::handler &cgh) {
+      sycl::accessor Result{ResultBuf, cgh};
+      cgh.single_task([=]() {
+        auto R = std::apply(F, ArgsTuple);
+        static_assert(std::is_same_v<decltype(Expected), decltype(R)>);
+        Result[0] = Expected == R;
+      });
+    });
+    assert(sycl::host_accessor{ResultBuf}[0]);
+  };
+
+  auto TestBitSelect = [&](auto type_val) {
+    using T = decltype(type_val);
+    auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); };
+
+    static_assert(std::is_integral_v<T>,
+                  "Only integer test is implemented here!");
+    Test(BitSelect, T{0b0110}, T{0b1100}, T{0b0011}, T{0b1010});
+  };
+
+  TestTypes<signed char, unsigned char, char, long, long long, unsigned long,
+            unsigned long long>(TestBitSelect);
+
+  auto TestSelect = [&](auto type_val) {
+    using T = decltype(type_val);
+    auto Select = [](auto... xs) { return sycl::select(xs...); };
+
+    Test(Select, T{0}, T{1}, T{0}, true);
+    Test(Select, T{1}, T{1}, T{0}, false);
+  };
+
+  TestTypes<signed char, unsigned char, char>(TestSelect);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Basic/built-ins/vec_relational.cpp b/sycl/test-e2e/Basic/built-ins/vec_relational.cpp
@@ -88,6 +88,70 @@ int main() {
   TEST2(sycl::any, int, EXPECTED(int32_t, 0), 3, va7);
   TEST(sycl::bitselect, float, EXPECTED(float, 1.0, 1.0), 2, va8, va9, va10);
   TEST(sycl::select, float, EXPECTED(float, 1.0, 2.0, 8.0), 3, va5, va6, c1);
+  {
+    // Extra tests for select/bitselect due to special handling required for
+    // integer return types.
+
+    auto Test = [&](auto F, auto Expected, auto... Args) {
+      std::tuple ArgsTuple{Args...};
+      auto Result = std::apply(F, ArgsTuple);
+      static_assert(std::is_same_v<decltype(Expected), decltype(Result)>);
+
+      // Note: operator==(vec, vec) return vec.
+      auto Equal = [](auto x, auto y) {
+        for (size_t i = 0; i < x.size(); ++i)
+          if (x[i] != y[i])
+            return false;
+
+        return true;
+      };
+
+      assert(Equal(Result, Expected));
+
+      sycl::buffer<bool, 1> ResultBuf{1};
+      deviceQueue.submit([&](sycl::handler &cgh) {
+        sycl::accessor Result{ResultBuf, cgh};
+        cgh.single_task([=]() {
+          auto R = std::apply(F, ArgsTuple);
+          static_assert(std::is_same_v<decltype(Expected), decltype(R)>);
+          Result[0] = Equal(R, Expected);
+        });
+      });
+      assert(sycl::host_accessor{ResultBuf}[0]);
+    };
+
+    // Note that only int8_t/uint8_t are supported by the bitselect/select
+    // builtins and not all three char data types. Also, use positive numbers
+    // for the values below so that we could use the same for both
+    // signed/unsigned tests.
+    sycl::vec<uint8_t, 2> a{0b1100, 0b0011};
+    sycl::vec<uint8_t, 2> b{0b0011, 0b1100};
+    sycl::vec<uint8_t, 2> c{0b1010, 0b1010};
+    sycl::vec<uint8_t, 2> r{0b0110, 0b1001};
+
+    auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); };
+    Test(BitSelect, r, a, b, c);
+    [&](auto... xs) {
+      Test(BitSelect, xs.template as<sycl::vec<int8_t, 2>>()...);
+    }(r, a, b, c);
+
+    auto Select = [](auto... xs) { return sycl::select(xs...); };
+    sycl::vec<uint8_t, 2> c2{0x7F, 0xFF};
+    sycl::vec<uint8_t, 2> r2{a[0], b[1]};
+
+    Test(Select, r2, a, b, c2);
+    [&](auto... xs) {
+      Test(Select, xs.template as<sycl::vec<int8_t, 2>>()..., c2);
+    }(r2, a, b);
+
+    // Assume that MSB of a signed data type is the leftmost bit (signbit).
+    auto c3 = c2.template as<sycl::vec<int8_t, 2>>();
+
+    Test(Select, r2, a, b, c3);
+    [&](auto... xs) {
+      Test(Select, xs.template as<sycl::vec<int8_t, 2>>()..., c3);
+    }(r2, a, b);
+  }
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
   TEST(sycl::isequal, int32_t, EXPECTED(int32_t, 1, 1), 2, va11.swizzle<0, 1>(),

diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp
@@ -246,18 +246,8 @@ class run_test {
     });
     queue.wait_and_throw();
 
-    // Verify the base value was passed as-is
-    if (!are_bitwise_equal(result[0], base_value)) {
-      passed = false;
-      log::fail(TestDescriptionT(data_type, BaseVal, Step),
-                "Unexpected value at index 0, retrieved: ", result[0],
-                ", expected: ", base_value);
-    }
-
-    // Verify the step value works as expected being passed to the fill
-    // constructor.
-    DataT expected_value = base_value;
-    for (size_t i = 1; i < result.size(); ++i) {
+    // Verify the the fill constructor.
+    for (size_t i = 0; i < result.size(); ++i) {
       if constexpr (BaseVal == init_val::nan || Step == init_val::nan) {
 
         if (!std::isnan(result[i])) {
@@ -268,7 +258,7 @@ class run_test {
         }
       } else {
 
-        expected_value += step_value;
+        DataT expected_value = base_value + (DataT)i * step_value;
         if (!are_bitwise_equal(result[i], expected_value)) {
           passed = false;
           log::fail(TestDescriptionT(data_type, BaseVal, Step),

diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp
@@ -15,8 +15,7 @@
 // The test verifies that simd fill constructor has no precision differences.
 // The test do the following actions:
 //  - call simd with predefined base and step values
-//  - bitwise comparing that output[0] value is equal to base value and
-//    output[i] is equal to output[i -1] + step_value
+//  - bitwise comparing that output[i] is equal to base + i * step_value.
 
 #include "ctor_fill.hpp"
 

diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp
@@ -5,10 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// TODO: remove fno-fast-math option once the issue is investigated and the test
-// is fixed.
-// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-// RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
 // RUN: %{run} %t.out
 //
 // Test for simd fill constructor for core types.
@@ -112,10 +109,22 @@ int main(int, char **) {
     }
     {
       const auto types = get_tested_types<tested_types::fp>();
+      {
+        const auto base_values =
+            ctors::get_init_values_pack<init_val::negative>();
+        const auto step_values =
+            ctors::get_init_values_pack<init_val::positive>();
+        passed &= for_all_combinations<ctors::run_test>(
+            types, sizes, contexts, base_values, step_values, queue);
+      }
+      // The test cases below have never been guaranteed to work some certain
+      // way with base and step values set to inf or non. They may or may not
+      // work as expected by the checks in this test.
       {
         const auto base_values =
             ctors::get_init_values_pack<init_val::neg_inf>();
-        const auto step_values = ctors::get_init_values_pack<init_val::max>();
+        const auto step_values =
+            ctors::get_init_values_pack<init_val::positive>();
         passed &= for_all_combinations<ctors::run_test>(
             types, sizes, contexts, base_values, step_values, queue);
       }

diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp
@@ -5,10 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// TODO: remove fno-fast-math option once the issue is investigated and the test
-// is fixed.
-// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-// RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
 // RUN: %{run} %t.out
 //
 // Test for simd fill constructor for extra fp types.

diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp
@@ -120,11 +120,11 @@ bool test(queue Q, uint32_t PMask = ~0) {
       uint32_t LID = I % (LocalRange * VL * NChannels);
       uint32_t GID = I / VL;
       bool Pred = (GID & 0x1) == 0;
-      T ExpectedVal = GroupId * 1000000 + LID;
+      Tuint ExpectedVal = GroupId * 1000000 + LID;
       if (TestMergeOperand && !Pred)
-        ExpectedVal = GID + (I % VL);
+        ExpectedVal = sycl::bit_cast<Tuint>((T)(GID + (I % VL)));
 
-      if (Out[I] != ExpectedVal && NErrors++ < 32) {
+      if (sycl::bit_cast<Tuint>(Out[I]) != ExpectedVal && NErrors++ < 32) {
         std::cout << "Error: " << I << ": Value = " << Out[I]
                   << ", Expected value = " << ExpectedVal << std::endl;
       }