From 904b634f21e6c1cdea2a0adf10deb99027397c4c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 05:27:11 -0600 Subject: [PATCH 1/4] Work-around issue with sub_group::load, sub_group::store functions These functions are from oneAPI sycl_ext_oneapi_group_load_store extension. https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_load_store.asciidoc The current implementation of these primitives seem to require certain pointer alignment. This PR adds bool template parameter to contig functors for all elementwise functions, and the elementwise_common_impl checks alignment all pointers. If any is not aligned to the expected boundary, a generic implementation is used instead of the one using sg.load/sg.store --- .../kernels/elementwise_functions/abs.hpp | 12 ++- .../kernels/elementwise_functions/acos.hpp | 12 ++- .../kernels/elementwise_functions/acosh.hpp | 6 +- .../kernels/elementwise_functions/add.hpp | 12 ++- .../elementwise_functions/alignment.hpp | 45 +++++++++ .../kernels/elementwise_functions/angle.hpp | 6 +- .../kernels/elementwise_functions/asin.hpp | 12 ++- .../kernels/elementwise_functions/asinh.hpp | 6 +- .../kernels/elementwise_functions/atan.hpp | 12 ++- .../kernels/elementwise_functions/atan2.hpp | 6 +- .../kernels/elementwise_functions/atanh.hpp | 6 +- .../elementwise_functions/bitwise_and.hpp | 12 ++- .../elementwise_functions/bitwise_invert.hpp | 6 +- .../bitwise_left_shift.hpp | 12 ++- .../elementwise_functions/bitwise_or.hpp | 12 ++- .../bitwise_right_shift.hpp | 12 ++- .../elementwise_functions/bitwise_xor.hpp | 12 ++- .../kernels/elementwise_functions/cbrt.hpp | 12 ++- .../kernels/elementwise_functions/ceil.hpp | 12 ++- .../kernels/elementwise_functions/common.hpp | 96 +++++++++++++++---- .../elementwise_functions/common_inplace.hpp | 49 ++++++++-- .../kernels/elementwise_functions/conj.hpp | 12 ++- .../elementwise_functions/copysign.hpp | 6 +- .../kernels/elementwise_functions/cos.hpp | 12 ++- .../kernels/elementwise_functions/cosh.hpp | 12 ++- .../kernels/elementwise_functions/equal.hpp | 6 +- .../kernels/elementwise_functions/exp.hpp | 12 ++- .../kernels/elementwise_functions/exp2.hpp | 12 ++- .../kernels/elementwise_functions/expm1.hpp | 6 +- .../kernels/elementwise_functions/floor.hpp | 6 +- .../elementwise_functions/floor_divide.hpp | 12 ++- .../kernels/elementwise_functions/greater.hpp | 6 +- .../elementwise_functions/greater_equal.hpp | 6 +- .../kernels/elementwise_functions/hypot.hpp | 6 +- .../kernels/elementwise_functions/imag.hpp | 12 ++- .../elementwise_functions/isfinite.hpp | 12 ++- .../kernels/elementwise_functions/isinf.hpp | 12 ++- .../kernels/elementwise_functions/isnan.hpp | 12 ++- .../kernels/elementwise_functions/less.hpp | 6 +- .../elementwise_functions/less_equal.hpp | 6 +- .../kernels/elementwise_functions/log.hpp | 12 ++- .../kernels/elementwise_functions/log10.hpp | 6 +- .../kernels/elementwise_functions/log1p.hpp | 6 +- .../kernels/elementwise_functions/log2.hpp | 12 ++- .../elementwise_functions/logaddexp.hpp | 6 +- .../elementwise_functions/logical_and.hpp | 6 +- .../elementwise_functions/logical_not.hpp | 6 +- .../elementwise_functions/logical_or.hpp | 6 +- .../elementwise_functions/logical_xor.hpp | 6 +- .../kernels/elementwise_functions/maximum.hpp | 6 +- .../kernels/elementwise_functions/minimum.hpp | 6 +- .../elementwise_functions/multiply.hpp | 12 ++- .../elementwise_functions/negative.hpp | 12 ++- .../elementwise_functions/not_equal.hpp | 6 +- .../elementwise_functions/positive.hpp | 12 ++- .../kernels/elementwise_functions/pow.hpp | 12 ++- .../kernels/elementwise_functions/proj.hpp | 12 ++- .../kernels/elementwise_functions/real.hpp | 12 ++- .../elementwise_functions/reciprocal.hpp | 6 +- .../elementwise_functions/remainder.hpp | 12 ++- .../kernels/elementwise_functions/round.hpp | 6 +- .../kernels/elementwise_functions/rsqrt.hpp | 6 +- .../kernels/elementwise_functions/sign.hpp | 12 ++- .../kernels/elementwise_functions/signbit.hpp | 12 ++- .../kernels/elementwise_functions/sin.hpp | 12 ++- .../kernels/elementwise_functions/sinh.hpp | 12 ++- .../kernels/elementwise_functions/sqrt.hpp | 12 ++- .../kernels/elementwise_functions/square.hpp | 6 +- .../elementwise_functions/subtract.hpp | 12 ++- .../kernels/elementwise_functions/tan.hpp | 12 ++- .../kernels/elementwise_functions/tanh.hpp | 12 ++- .../elementwise_functions/true_divide.hpp | 12 ++- .../kernels/elementwise_functions/trunc.hpp | 6 +- .../elementwise_functions.hpp | 73 +++++++++----- 74 files changed, 682 insertions(+), 241 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 911452931e..9e13648163 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -132,9 +132,15 @@ template struct AbsFunctor template -using AbsContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using AbsContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template struct AbsOutputType { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp index c4742e66dc..cf6875c341 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -145,9 +145,15 @@ template struct AcosFunctor template -using AcosContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using AcosContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using AcosStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp index b736d5b658..a6ffa805d7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -167,13 +167,15 @@ template struct AcoshFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using AcoshContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using AcoshStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index 1297847831..aae69d98ea 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -123,14 +123,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using AddContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using AddStridedFunctor = @@ -425,13 +427,15 @@ template struct AddInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, AddInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using AddInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp new file mode 100644 index 0000000000..75d9962404 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp @@ -0,0 +1,45 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace alignment_utils +{ + +static constexpr size_t required_alignment = 64; + +template bool is_aligned(Ptr p) +{ + return !(reinterpret_cast(p) % alignment); +} + +template class disabled_sg_loadstore_wrapper_krn; + +} // end of namespace alignment_utils +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 17ef25b0b6..84b7104b47 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -80,13 +80,15 @@ template struct AngleFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstire = true> using AngleContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstire>; template using AngleStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp index 9da5077665..dc5f2c2b18 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -169,9 +169,15 @@ template struct AsinFunctor template -using AsinContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using AsinContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using AsinStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp index ceab296b91..6d712165a9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -143,13 +143,15 @@ template struct AsinhFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using AsinhContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using AsinhStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp index a9af6d829e..93c9a6696d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -145,9 +145,15 @@ template struct AtanFunctor template -using AtanContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using AtanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using AtanStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp index 8df1667312..ac8c0483c4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -70,14 +70,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using Atan2ContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using Atan2StridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp index 3be6abb742..4a26cd92b4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -138,13 +138,15 @@ template struct AtanhFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using AtanhContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using AtanhStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index b9a0d41f93..e4da56cd9e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -93,14 +93,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, BitwiseAndFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseAndStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -296,14 +298,16 @@ template struct BitwiseAndInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseAndInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, BitwiseAndInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseAndInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index 93e715dc4d..cc629594b9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -92,13 +92,15 @@ template struct BitwiseInvertFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseInvertContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseInvertStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index b13971e27a..58ef64e16a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -102,14 +102,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, BitwiseLeftShiftFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseLeftShiftStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -313,14 +315,16 @@ template struct BitwiseLeftShiftInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseLeftShiftInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, BitwiseLeftShiftInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseLeftShiftInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index a07d4ed540..afd24216b2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -92,14 +92,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, BitwiseOrFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseOrStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -292,14 +294,16 @@ template struct BitwiseOrInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseOrInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, BitwiseOrInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseOrInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index ce8537bc31..f1989b8f64 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -103,14 +103,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, BitwiseRightShiftFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseRightShiftStridedFunctor = @@ -317,14 +319,16 @@ template struct BitwiseRightShiftInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseRightShiftInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, BitwiseRightShiftInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseRightShiftInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index e2ce5a5703..7b777528c2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -93,14 +93,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, BitwiseXorFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseXorStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -296,14 +298,16 @@ template struct BitwiseXorInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using BitwiseXorInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, BitwiseXorInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using BitwiseXorInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index 92584f0dfe..21b8f79b81 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -70,9 +70,15 @@ template struct CbrtFunctor template -using CbrtContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using CbrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using CbrtStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp index 0059064ec1..3672de4d9c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -80,9 +80,15 @@ template struct CeilFunctor template -using CeilContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using CeilContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using CeilStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 5dc4728a65..433bed8614 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -29,6 +29,7 @@ #include #include +#include "kernels/elementwise_functions/alignment.hpp" #include "utils/offset_utils.hpp" namespace dpctl @@ -42,12 +43,18 @@ namespace elementwise_common namespace py = pybind11; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + /*! @brief Functor for unary function evaluation on contiguous array */ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> struct UnaryContigFunctor { private: @@ -67,7 +74,8 @@ struct UnaryContigFunctor /* Each work-item processes vec_sz elements, contiguous in memory */ /* NOTE: vec_sz must divide sg.max_local_range()[0] */ - if constexpr (UnaryOperatorT::is_constant::value) { + if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value) + { // value of operator is known to be a known constant constexpr resT const_val = UnaryOperatorT::constant_value; @@ -98,7 +106,8 @@ struct UnaryContigFunctor } } } - else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && UnaryOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); @@ -135,7 +144,8 @@ struct UnaryContigFunctor } } } - else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && std::is_same_v) { // default: use scalar-value function @@ -177,7 +187,9 @@ struct UnaryContigFunctor } } } - else if constexpr (UnaryOperatorT::supports_sg_loadstore::value) { + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value) + { // default: use scalar-value function auto sg = ndit.get_sub_group(); @@ -264,7 +276,11 @@ struct UnaryStridedFunctor template class UnaryOutputType, - template + template class ContigFunctorT, template class kernel_name, @@ -289,10 +305,28 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, const argTy *arg_tp = reinterpret_cast(arg_p); resTy *res_tp = reinterpret_cast(res_p); - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - ContigFunctorT(arg_tp, res_tp, - nelems)); + if (is_aligned(arg_p) && + is_aligned(res_p)) + { + constexpr bool enable_sg_loadstore = true; + using KernelName = kernel_name; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigFunctorT(arg_tp, res_tp, nelems)); + } + else { + constexpr bool disable_sg_loadstore = false; + using InnerKernelName = kernel_name; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigFunctorT(arg_tp, res_tp, nelems)); + } }); return comp_ev; } @@ -341,7 +375,8 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> struct BinaryContigFunctor { private: @@ -364,7 +399,8 @@ struct BinaryContigFunctor BinaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ - if constexpr (BinaryOperatorT::supports_sg_loadstore::value && + if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value && BinaryOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); @@ -408,7 +444,9 @@ struct BinaryContigFunctor } } } - else if constexpr (BinaryOperatorT::supports_sg_loadstore::value) { + else if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value) + { auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_local_range()[0]; std::uint8_t maxsgSize = sg.get_max_local_range()[0]; @@ -714,7 +752,8 @@ template + unsigned int nv, + bool enable_sg_loadstore> class BinaryContigFunctorT, template (arg2_p) + arg2_offset; resTy *res_tp = reinterpret_cast(res_p) + res_offset; - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - BinaryContigFunctorT( - arg1_tp, arg2_tp, res_tp, nelems)); + if (is_aligned(arg1_tp) && + is_aligned(arg2_tp) && + is_aligned(res_tp)) + { + constexpr bool enable_sg_loadstore = true; + using KernelName = + kernel_name; + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + BinaryContigFunctorT(arg1_tp, arg2_tp, + res_tp, nelems)); + } + else { + constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + kernel_name; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + BinaryContigFunctorT(arg1_tp, arg2_tp, + res_tp, nelems)); + } }); return comp_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index c4f893a532..107baecf6c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -29,6 +29,8 @@ #include #include +#include "kernels/elementwise_functions/alignment.hpp" + namespace dpctl { namespace tensor @@ -38,11 +40,17 @@ namespace kernels namespace elementwise_common { +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> struct BinaryInplaceContigFunctor { private: @@ -63,7 +71,8 @@ struct BinaryInplaceContigFunctor BinaryInplaceOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ - if constexpr (BinaryInplaceOperatorT::supports_sg_loadstore::value && + if constexpr (enable_sg_loadstore && + BinaryInplaceOperatorT::supports_sg_loadstore::value && BinaryInplaceOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); @@ -103,7 +112,8 @@ struct BinaryInplaceContigFunctor } } } - else if constexpr (BinaryInplaceOperatorT::supports_sg_loadstore::value) + else if constexpr (enable_sg_loadstore && + BinaryInplaceOperatorT::supports_sg_loadstore::value) { auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_local_range()[0]; @@ -281,7 +291,11 @@ typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)( template + template class BinaryInplaceContigFunctorT, template class kernel_name, @@ -309,10 +323,29 @@ binary_inplace_contig_impl(sycl::queue &exec_q, reinterpret_cast(rhs_p) + rhs_offset; resTy *res_tp = reinterpret_cast(lhs_p) + lhs_offset; - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - BinaryInplaceContigFunctorT( - arg_tp, res_tp, nelems)); + if (is_aligned(arg_tp) && + is_aligned(res_tp)) + { + constexpr bool enable_sg_loadstore = true; + using KernelName = kernel_name; + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + BinaryInplaceContigFunctorT(arg_tp, res_tp, + nelems)); + } + else { + constexpr bool disable_sg_loadstore = true; + using InnerKernelName = kernel_name; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + BinaryInplaceContigFunctorT( + arg_tp, res_tp, nelems)); + } }); return comp_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index a3c607aa49..b8ebf34a23 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -88,9 +88,15 @@ template struct ConjFunctor template -using ConjContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using ConjContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using ConjStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp index 43e06cb281..00e926d3d8 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -83,14 +83,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using CopysignContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index c8cd8ef18c..e804d5f3df 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -170,9 +170,15 @@ template struct CosFunctor template -using CosContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using CosContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using CosStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp index f03d438cbe..16406d5547 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -159,9 +159,15 @@ template struct CoshFunctor template -using CoshContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using CoshContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using CoshStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index 9f866aa580..c354b612a7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -104,14 +104,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using EqualContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using EqualStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp index add15c3523..66e84b69bf 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -128,9 +128,15 @@ template struct ExpFunctor template -using ExpContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using ExpContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using ExpStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp index a22411414b..ae590fc2bf 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -130,9 +130,15 @@ template struct Exp2Functor template -using Exp2ContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using Exp2ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using Exp2StridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index f5204e87b3..8e82031388 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -137,13 +137,15 @@ template struct Expm1Functor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using Expm1ContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using Expm1StridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp index 88a20dafe0..a8f810d9ac 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -80,13 +80,15 @@ template struct FloorFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using FloorContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using FloorStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index 4ba335c98b..d09f376d04 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -131,14 +131,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, FloorDivideFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -378,14 +380,16 @@ template struct FloorDivideInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using FloorDivideInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, FloorDivideInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using FloorDivideInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index e01360efa7..2ba942fb32 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -101,14 +101,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using GreaterContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using GreaterStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp index f017b7f150..48503c608d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -102,14 +102,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, GreaterEqualFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using GreaterEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index fd19d29c0b..64d8a8f059 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -85,14 +85,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using HypotContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using HypotStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp index bb1ff2ebcb..03ed7bad78 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -80,9 +80,15 @@ template struct ImagFunctor template -using ImagContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using ImagContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using ImagStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index cef7c96ed2..b0dab6249c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -100,9 +100,15 @@ template struct IsFiniteFunctor template -using IsFiniteContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using IsFiniteContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using IsFiniteStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index 2720385614..8c805b7934 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -98,9 +98,15 @@ template struct IsInfFunctor template -using IsInfContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using IsInfContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using IsInfStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index 15551e295a..8b10ce2295 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -96,9 +96,15 @@ template struct IsNanFunctor template -using IsNanContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using IsNanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using IsNanStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp index 02c7a0d95a..1827ca3185 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -100,14 +100,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using LessContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using LessStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp index f9f6729968..0b6d06fff3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -101,14 +101,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, LessEqualFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using LessEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index 3668551473..48ec92a257 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -84,9 +84,15 @@ template struct LogFunctor template -using LogContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using LogContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using LogStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp index 5997aded5f..6f3c9d1925 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -103,13 +103,15 @@ template struct Log10Functor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using Log10ContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using Log10StridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp index 11e3fb3f9f..3b417b46b9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -104,13 +104,15 @@ template struct Log1pFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using Log1pContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using Log1pStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp index 211a5fdb6c..079c5bf94b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -103,9 +103,15 @@ template struct Log2Functor template -using Log2ContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using Log2ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using Log2StridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index e918454319..9fb3759779 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -101,14 +101,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, LogAddExpFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index 988d1ed380..135c264751 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -94,14 +94,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, LogicalAndFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using LogicalAndStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index 826af2ee37..8a820c5172 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -68,13 +68,15 @@ template struct LogicalNotFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using LogicalNotContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using LogicalNotStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp index 333951e6b5..a444ced41f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -93,14 +93,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, LogicalOrFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using LogicalOrStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index ce4bde9e6b..a3d175f413 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -95,14 +95,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, LogicalXorFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using LogicalXorStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index 8a1990ba7d..22a63882a9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -98,14 +98,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using MaximumContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using MaximumStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp index fb3490ee19..a11a36aeee 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -98,14 +98,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using MinimumContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using MinimumStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp index 47549b5a74..7aeea6fdc7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -103,14 +103,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using MultiplyContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -414,14 +416,16 @@ template struct MultiplyInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using MultiplyInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, MultiplyInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using MultiplyInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index bc28aafad7..b67e74438f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -70,9 +70,15 @@ template struct NegativeFunctor template -using NegativeContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using NegativeContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template struct NegativeOutputType { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index faeab82580..a5bc3a6cc6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -100,14 +100,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using NotEqualContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using NotEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp index b3e109c76c..6136a55bce 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -85,9 +85,15 @@ template struct PositiveFunctor template -using PositiveContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using PositiveContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template struct PositiveOutputType { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index 9068e67f10..65214c9533 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -157,14 +157,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using PowContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using PowStridedFunctor = @@ -427,13 +429,15 @@ template struct PowInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, PowInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using PowInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index 634c29b5ed..2fe1d70bd5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -93,9 +93,15 @@ template struct ProjFunctor template -using ProjContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using ProjContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using ProjStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp index 6a7580d548..94cdaf1496 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -80,9 +80,15 @@ template struct RealFunctor template -using RealContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using RealContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using RealStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp index c775576867..ecc5959fc3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -87,13 +87,15 @@ template struct ReciprocalFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using ReciprocalContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using ReciprocalStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 9e64e6500f..b25c7be91f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -148,14 +148,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using RemainderContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, RemainderFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using RemainderStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -402,14 +404,16 @@ template struct RemainderInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using RemainderInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, RemainderInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using RemainderInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index 547d31b392..e8221e4c25 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -89,13 +89,15 @@ template struct RoundFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using RoundContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using RoundStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp index d9e0c33081..a7f70337f8 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -73,13 +73,15 @@ template struct RsqrtFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using RsqrtContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using RsqrtStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp index 521e935c16..804b6260f1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -109,9 +109,15 @@ template struct SignFunctor template -using SignContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using SignContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template struct SignOutputType { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp index 3e961c466d..401db90b63 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -79,9 +79,15 @@ template struct SignbitFunctor template -using SignbitContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using SignbitContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using SignbitStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp index 97768fc8e9..96c216c475 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp @@ -186,9 +186,15 @@ template struct SinFunctor template -using SinContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using SinContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using SinStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp index fe7dae533b..cd7f998afa 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -161,9 +161,15 @@ template struct SinhFunctor template -using SinhContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using SinhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using SinhStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 2d423502a7..3eea801fb2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -268,9 +268,15 @@ template struct SqrtFunctor template -using SqrtContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using SqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using SqrtStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp index c888c4eefa..656ec47c4b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -103,13 +103,15 @@ template struct SquareFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using SquareContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using SquareStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 9447873dec..a24e71e9d2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -86,14 +86,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using SubtractContigFunctor = elementwise_common::BinaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -411,14 +413,16 @@ template struct SubtractInplaceFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using SubtractInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, SubtractInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using SubtractInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index 2c648edd7b..e2d08cba0d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -136,9 +136,15 @@ template struct TanFunctor template -using TanContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using TanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using TanStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp index 84ba7a9a3f..13ea6c7eee 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -131,9 +131,15 @@ template struct TanhFunctor template -using TanhContigFunctor = elementwise_common:: - UnaryContigFunctor, vec_sz, n_vecs>; + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> +using TanhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; template using TanhStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index c72634d106..e063ecef54 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -125,14 +125,16 @@ template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor< argT1, argT2, resT, TrueDivideFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using TrueDivideStridedFunctor = elementwise_common::BinaryStridedFunctor< @@ -509,14 +511,16 @@ struct TrueDivideInplaceTypeMapFactory template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using TrueDivideInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, resT, TrueDivideInplaceFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using TrueDivideInplaceStridedFunctor = diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp index 0e08d966e9..35b0783719 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -77,13 +77,15 @@ template struct TruncFunctor template + unsigned int n_vecs = 2, + bool enable_sg_loadstore = true> using TruncContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, - n_vecs>; + n_vecs, + enable_sg_loadstore>; template using TruncStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 0410229a0a..35d7545f2d 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -33,6 +33,7 @@ #include #include "elementwise_functions_type_utils.hpp" +#include "kernels/elementwise_functions/alignment.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" @@ -48,6 +49,9 @@ namespace tensor namespace py_internal { +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + /*! @brief Template implementing Python API for unary elementwise functions */ template py_binary_ufunc( int nd = dst_nd; const py::ssize_t *shape = src1_shape; - // all args except itemsizes and is_?_contig bools can be modified by - // reference dpctl::tensor::py_internal::simplify_iteration_space_3( nd, shape, src1_strides, src2_strides, dst_strides, // outputs @@ -487,16 +489,29 @@ std::pair py_binary_ufunc( contig_matrix_row_broadcast_dispatch_table[src1_typeid] [src2_typeid]; if (matrix_row_broadcast_fn != nullptr) { - size_t n0 = simplified_shape[0]; - size_t n1 = simplified_shape[1]; - sycl::event comp_ev = matrix_row_broadcast_fn( - exec_q, host_tasks, n0, n1, src1_data, src1_offset, - src2_data, src2_offset, dst_data, dst_offset, depends); - - return std::make_pair( - dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, - host_tasks), - comp_ev); + int src1_itemsize = src1.get_elemsize(); + int src2_itemsize = src2.get_elemsize(); + int dst_itemsize = dst.get_elemsize(); + + if (is_aligned( + src1_data + src1_offset * src1_itemsize) && + is_aligned( + src2_data + src2_offset * src2_itemsize) && + is_aligned( + dst_data + dst_offset * dst_itemsize)) + { + size_t n0 = simplified_shape[0]; + size_t n1 = simplified_shape[1]; + sycl::event comp_ev = matrix_row_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, host_tasks), + comp_ev); + } } } if (isEqual(simplified_src1_strides, one_zero_strides) && @@ -507,16 +522,30 @@ std::pair py_binary_ufunc( contig_row_matrix_broadcast_dispatch_table[src1_typeid] [src2_typeid]; if (row_matrix_broadcast_fn != nullptr) { - size_t n0 = simplified_shape[1]; - size_t n1 = simplified_shape[0]; - sycl::event comp_ev = row_matrix_broadcast_fn( - exec_q, host_tasks, n0, n1, src1_data, src1_offset, - src2_data, src2_offset, dst_data, dst_offset, depends); - return std::make_pair( - dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, - host_tasks), - comp_ev); + int src1_itemsize = src1.get_elemsize(); + int src2_itemsize = src2.get_elemsize(); + int dst_itemsize = dst.get_elemsize(); + + if (is_aligned( + src1_data + src1_offset * src1_itemsize) && + is_aligned( + src2_data + src2_offset * src2_itemsize) && + is_aligned( + dst_data + dst_offset * dst_itemsize)) + { + size_t n0 = simplified_shape[1]; + size_t n1 = simplified_shape[0]; + sycl::event comp_ev = row_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, host_tasks), + comp_ev); + } } } } @@ -736,8 +765,6 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, int nd = lhs_nd; const py::ssize_t *shape = rhs_shape; - // all args except itemsizes and is_?_contig bools can be modified by - // reference dpctl::tensor::py_internal::simplify_iteration_space( nd, shape, rhs_strides, lhs_strides, // outputs From 786bec7abfa7b035b22a94b18ddeddd7e45daf47 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 05:59:39 -0600 Subject: [PATCH 2/4] Added tests for abs/not_equal with not-aligned start of array --- dpctl/tests/elementwise/test_abs.py | 15 +++++++++++++++ dpctl/tests/elementwise/test_not_equal.py | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py index 1db17c1b80..ca296911a0 100644 --- a/dpctl/tests/elementwise/test_abs.py +++ b/dpctl/tests/elementwise/test_abs.py @@ -187,3 +187,18 @@ def test_abs_complex_fp_special_values(dtype): tol = dpt.finfo(r.dtype).resolution assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_abs_alignment(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.ones(512, dtype=dtype) + r = dpt.abs(x) + + r2 = dpt.abs(x[1:]) + assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2)) + + dpt.abs(x[:-1], out=r[1:]) + assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2)) diff --git a/dpctl/tests/elementwise/test_not_equal.py b/dpctl/tests/elementwise/test_not_equal.py index 6baef3f145..222faccc39 100644 --- a/dpctl/tests/elementwise/test_not_equal.py +++ b/dpctl/tests/elementwise/test_not_equal.py @@ -188,3 +188,21 @@ def __sycl_usm_array_interface__(self): c = Canary() with pytest.raises(ValueError): dpt.not_equal(a, c) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_not_equal_alignment(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 256 + s = dpt.concat((dpt.zeros(n, dtype=dtype), dpt.zeros(n, dtype=dtype))) + + mask = s[:-1] != s[1:] + (pos,) = dpt.nonzero(mask) + assert dpt.all(pos == n) + + out_arr = dpt.zeros(2 * n, dtype=mask.dtype) + dpt.not_equal(s[:-1], s[1:], out=out_arr[1:]) + (pos,) = dpt.nonzero(mask) + assert dpt.all(pos == (n + 1)) From a307c008529a3802063ce67b1be5d71741661cdd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 8 Dec 2023 15:48:15 -0800 Subject: [PATCH 3/4] Adds sub-group work-around to copy, clip, where Moves alignment.hpp into dpctl/tensor/libtensor/include/kernels Fixes a small typo in angle.hpp --- .../{elementwise_functions => }/alignment.hpp | 0 .../tensor/libtensor/include/kernels/clip.hpp | 42 +++++++++++++--- .../include/kernels/copy_and_cast.hpp | 43 +++++++++++++--- .../kernels/elementwise_functions/angle.hpp | 4 +- .../kernels/elementwise_functions/common.hpp | 2 +- .../elementwise_functions/common_inplace.hpp | 2 +- .../libtensor/include/kernels/where.hpp | 50 ++++++++++++++++--- .../elementwise_functions.hpp | 2 +- 8 files changed, 121 insertions(+), 24 deletions(-) rename dpctl/tensor/libtensor/include/kernels/{elementwise_functions => }/alignment.hpp (100%) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp rename to dpctl/tensor/libtensor/include/kernels/alignment.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp index 9cca9f615b..aff1acb071 100644 --- a/dpctl/tensor/libtensor/include/kernels/clip.hpp +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -32,6 +32,7 @@ #include #include +#include "kernels/alignment.hpp" #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch.hpp" @@ -51,6 +52,11 @@ namespace td_ns = dpctl::tensor::type_dispatch; using namespace dpctl::tensor::offset_utils; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + template T clip(const T &x, const T &min, const T &max) { using dpctl::tensor::type_utils::is_complex; @@ -73,7 +79,11 @@ template T clip(const T &x, const T &min, const T &max) } } -template class ClipContigFunctor +template +class ClipContigFunctor { private: size_t nelems = 0; @@ -96,7 +106,7 @@ template class ClipContigFunctor void operator()(sycl::nd_item<1> ndit) const { using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { + if constexpr (is_complex::value || !enable_sg_loadstore) { std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; size_t base = ndit.get_global_linear_id(); @@ -195,10 +205,30 @@ sycl::event clip_contig_impl(sycl::queue &q, const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - ClipContigFunctor(nelems, x_tp, min_tp, max_tp, - dst_tp)); + if (is_aligned(x_cp) && + is_aligned(min_cp) && + is_aligned(max_cp) && + is_aligned(dst_cp)) + { + constexpr bool enable_sg_loadstore = true; + using KernelName = clip_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ClipContigFunctor( + nelems, x_tp, min_tp, max_tp, dst_tp)); + } + else { + constexpr bool disable_sg_loadstore = false; + using InnerKernelName = clip_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ClipContigFunctor( + nelems, x_tp, min_tp, max_tp, dst_tp)); + } }); return clip_ev; diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index 9d1c788626..ef24b58ef2 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -29,6 +29,7 @@ #include #include +#include "kernels/alignment.hpp" #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" @@ -44,6 +45,11 @@ namespace copy_and_cast namespace py = pybind11; using namespace dpctl::tensor::offset_utils; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + template class copy_cast_generic_kernel; @@ -200,7 +206,8 @@ template + int n_vecs = 2, + bool enable_sg_loadstore = true> class ContigCopyFunctor { private: @@ -219,7 +226,9 @@ class ContigCopyFunctor CastFnT fn{}; using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value || is_complex::value) { + if constexpr (!enable_sg_loadstore || is_complex::value || + is_complex::value) + { std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; size_t base = ndit.get_global_linear_id(); @@ -326,10 +335,32 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q, const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - ContigCopyFunctor, vec_sz, - n_vecs>(nelems, src_tp, dst_tp)); + if (is_aligned(src_cp) && + is_aligned(dst_cp)) + { + constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } }); return copy_and_cast_ev; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 84b7104b47..2759974b93 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -81,14 +81,14 @@ template + bool enable_sg_loadstore = true> using AngleContigFunctor = elementwise_common::UnaryContigFunctor, vec_sz, n_vecs, - enable_sg_loadstire>; + enable_sg_loadstore>; template using AngleStridedFunctor = elementwise_common:: diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 433bed8614..1794dbf721 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -29,7 +29,7 @@ #include #include -#include "kernels/elementwise_functions/alignment.hpp" +#include "kernels/alignment.hpp" #include "utils/offset_utils.hpp" namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index 107baecf6c..deaef5522f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -29,7 +29,7 @@ #include #include -#include "kernels/elementwise_functions/alignment.hpp" +#include "kernels/alignment.hpp" namespace dpctl { diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp index 9558603d5e..a1c0c7cfb0 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -25,8 +25,6 @@ #pragma once #include "pybind11/numpy.h" #include "pybind11/stl.h" -#include "utils/offset_utils.hpp" -#include "utils/type_utils.hpp" #include #include #include @@ -34,6 +32,10 @@ #include #include +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + namespace dpctl { namespace tensor @@ -47,12 +49,21 @@ namespace py = pybind11; using namespace dpctl::tensor::offset_utils; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + template class where_strided_kernel; template class where_contig_kernel; -template +template class WhereContigFunctor { private: @@ -76,7 +87,9 @@ class WhereContigFunctor void operator()(sycl::nd_item<1> ndit) const { using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value || is_complex::value) { + if constexpr (!enable_sg_loadstore || is_complex::value || + is_complex::value) + { std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; size_t base = ndit.get_global_linear_id(); @@ -175,10 +188,33 @@ sycl::event where_contig_impl(sycl::queue &q, const auto gws_range = sycl::range<1>(n_groups * lws); const auto lws_range = sycl::range<1>(lws); - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - WhereContigFunctor(nelems, cond_tp, x1_tp, + if (is_aligned(cond_cp) && + is_aligned(x1_cp) && + is_aligned(x2_cp) && + is_aligned(dst_cp)) + { + constexpr bool enable_sg_loadstore = true; + using KernelName = where_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + else { + constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + where_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, x2_tp, dst_tp)); + } }); return where_ev; diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 35d7545f2d..da0137fd5f 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -33,7 +33,7 @@ #include #include "elementwise_functions_type_utils.hpp" -#include "kernels/elementwise_functions/alignment.hpp" +#include "kernels/alignment.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" From 86153f3a1d9ed32c63f72f3baa862304182baab5 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 8 Dec 2023 15:48:30 -0800 Subject: [PATCH 4/4] Adds tests for unaligned clip, where, copy --- dpctl/tests/test_tensor_clip.py | 11 +++++++++++ dpctl/tests/test_usm_ndarray_ctor.py | 9 +++++++++ dpctl/tests/test_usm_ndarray_search_functions.py | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py index 7050b17e7c..39ba35a4a1 100644 --- a/dpctl/tests/test_tensor_clip.py +++ b/dpctl/tests/test_tensor_clip.py @@ -625,3 +625,14 @@ def test_clip_max_weak_types(): with pytest.raises(ValueError): dpt.clip(x, 2.5, m) + + +def test_clip_unaligned(): + get_queue_or_skip() + + x = dpt.full(513, 5, dtype="i4") + a_min = dpt.zeros(512, dtype="i4") + a_max = dpt.full(512, 2, dtype="i4") + + expected = dpt.full(512, 2, dtype="i4") + assert dpt.all(dpt.clip(x[1:], a_min, a_max) == expected) diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 5e3feb660d..7227e687af 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -1326,6 +1326,15 @@ def test_copy(): assert np.array_equal(dpt.asnumpy(Yk), ref) +def test_copy_unaligned(): + get_queue_or_skip() + + x = dpt.ones(513, dtype="i4") + r = dpt.astype(x[1:], "f4") + + assert dpt.all(r == 1) + + def test_ctor_invalid(): try: m = dpm.MemoryUSMShared(12) diff --git a/dpctl/tests/test_usm_ndarray_search_functions.py b/dpctl/tests/test_usm_ndarray_search_functions.py index 74bd056729..dc62dd1951 100644 --- a/dpctl/tests/test_usm_ndarray_search_functions.py +++ b/dpctl/tests/test_usm_ndarray_search_functions.py @@ -412,3 +412,14 @@ def test_where_order(): condition = dpt.zeros(n, dtype="?", order="C") res = dpt.where(condition, ar1, ar2) assert res.strides == (20, 1) + + +def test_where_unaligned(): + get_queue_or_skip() + + x = dpt.ones(513, dtype="i4") + a = dpt.full(512, 2, dtype="i4") + b = dpt.zeros(512, dtype="i4") + + expected = dpt.full(512, 2, dtype="i4") + assert dpt.all(dpt.where(x[1:], a, b) == expected)