From 904b634f21e6c1cdea2a0adf10deb99027397c4c Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 8 Dec 2023 05:27:11 -0600
Subject: [PATCH 1/4] Work-around issue with sub_group::load, sub_group::store
 functions

These functions are from oneAPI sycl_ext_oneapi_group_load_store extension.
https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_load_store.asciidoc

The current implementation of these primitives seem to require certain pointer alignment.

This PR adds bool template parameter to contig functors for all elementwise functions,
and the elementwise_common_impl checks alignment all pointers. If any is not aligned to
the expected boundary, a generic implementation is used instead of the one using
sg.load/sg.store
---
 .../kernels/elementwise_functions/abs.hpp     | 12 ++-
 .../kernels/elementwise_functions/acos.hpp    | 12 ++-
 .../kernels/elementwise_functions/acosh.hpp   |  6 +-
 .../kernels/elementwise_functions/add.hpp     | 12 ++-
 .../elementwise_functions/alignment.hpp       | 45 +++++++++
 .../kernels/elementwise_functions/angle.hpp   |  6 +-
 .../kernels/elementwise_functions/asin.hpp    | 12 ++-
 .../kernels/elementwise_functions/asinh.hpp   |  6 +-
 .../kernels/elementwise_functions/atan.hpp    | 12 ++-
 .../kernels/elementwise_functions/atan2.hpp   |  6 +-
 .../kernels/elementwise_functions/atanh.hpp   |  6 +-
 .../elementwise_functions/bitwise_and.hpp     | 12 ++-
 .../elementwise_functions/bitwise_invert.hpp  |  6 +-
 .../bitwise_left_shift.hpp                    | 12 ++-
 .../elementwise_functions/bitwise_or.hpp      | 12 ++-
 .../bitwise_right_shift.hpp                   | 12 ++-
 .../elementwise_functions/bitwise_xor.hpp     | 12 ++-
 .../kernels/elementwise_functions/cbrt.hpp    | 12 ++-
 .../kernels/elementwise_functions/ceil.hpp    | 12 ++-
 .../kernels/elementwise_functions/common.hpp  | 96 +++++++++++++++----
 .../elementwise_functions/common_inplace.hpp  | 49 ++++++++--
 .../kernels/elementwise_functions/conj.hpp    | 12 ++-
 .../elementwise_functions/copysign.hpp        |  6 +-
 .../kernels/elementwise_functions/cos.hpp     | 12 ++-
 .../kernels/elementwise_functions/cosh.hpp    | 12 ++-
 .../kernels/elementwise_functions/equal.hpp   |  6 +-
 .../kernels/elementwise_functions/exp.hpp     | 12 ++-
 .../kernels/elementwise_functions/exp2.hpp    | 12 ++-
 .../kernels/elementwise_functions/expm1.hpp   |  6 +-
 .../kernels/elementwise_functions/floor.hpp   |  6 +-
 .../elementwise_functions/floor_divide.hpp    | 12 ++-
 .../kernels/elementwise_functions/greater.hpp |  6 +-
 .../elementwise_functions/greater_equal.hpp   |  6 +-
 .../kernels/elementwise_functions/hypot.hpp   |  6 +-
 .../kernels/elementwise_functions/imag.hpp    | 12 ++-
 .../elementwise_functions/isfinite.hpp        | 12 ++-
 .../kernels/elementwise_functions/isinf.hpp   | 12 ++-
 .../kernels/elementwise_functions/isnan.hpp   | 12 ++-
 .../kernels/elementwise_functions/less.hpp    |  6 +-
 .../elementwise_functions/less_equal.hpp      |  6 +-
 .../kernels/elementwise_functions/log.hpp     | 12 ++-
 .../kernels/elementwise_functions/log10.hpp   |  6 +-
 .../kernels/elementwise_functions/log1p.hpp   |  6 +-
 .../kernels/elementwise_functions/log2.hpp    | 12 ++-
 .../elementwise_functions/logaddexp.hpp       |  6 +-
 .../elementwise_functions/logical_and.hpp     |  6 +-
 .../elementwise_functions/logical_not.hpp     |  6 +-
 .../elementwise_functions/logical_or.hpp      |  6 +-
 .../elementwise_functions/logical_xor.hpp     |  6 +-
 .../kernels/elementwise_functions/maximum.hpp |  6 +-
 .../kernels/elementwise_functions/minimum.hpp |  6 +-
 .../elementwise_functions/multiply.hpp        | 12 ++-
 .../elementwise_functions/negative.hpp        | 12 ++-
 .../elementwise_functions/not_equal.hpp       |  6 +-
 .../elementwise_functions/positive.hpp        | 12 ++-
 .../kernels/elementwise_functions/pow.hpp     | 12 ++-
 .../kernels/elementwise_functions/proj.hpp    | 12 ++-
 .../kernels/elementwise_functions/real.hpp    | 12 ++-
 .../elementwise_functions/reciprocal.hpp      |  6 +-
 .../elementwise_functions/remainder.hpp       | 12 ++-
 .../kernels/elementwise_functions/round.hpp   |  6 +-
 .../kernels/elementwise_functions/rsqrt.hpp   |  6 +-
 .../kernels/elementwise_functions/sign.hpp    | 12 ++-
 .../kernels/elementwise_functions/signbit.hpp | 12 ++-
 .../kernels/elementwise_functions/sin.hpp     | 12 ++-
 .../kernels/elementwise_functions/sinh.hpp    | 12 ++-
 .../kernels/elementwise_functions/sqrt.hpp    | 12 ++-
 .../kernels/elementwise_functions/square.hpp  |  6 +-
 .../elementwise_functions/subtract.hpp        | 12 ++-
 .../kernels/elementwise_functions/tan.hpp     | 12 ++-
 .../kernels/elementwise_functions/tanh.hpp    | 12 ++-
 .../elementwise_functions/true_divide.hpp     | 12 ++-
 .../kernels/elementwise_functions/trunc.hpp   |  6 +-
 .../elementwise_functions.hpp                 | 73 +++++++++-----
 74 files changed, 682 insertions(+), 241 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index 911452931e..9e13648163 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -132,9 +132,15 @@ template <typename argT, typename resT> struct AbsFunctor
 template <typename argT,
           typename resT = argT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using AbsContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, AbsFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using AbsContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           AbsFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename T> struct AbsOutputType
 {
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index c4742e66dc..cf6875c341 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AcosFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using AcosContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, AcosFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using AcosContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AcosFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AcosStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index b736d5b658..a6ffa805d7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -167,13 +167,15 @@ template <typename argT, typename resT> struct AcoshFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using AcoshContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            AcoshFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AcoshStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 1297847831..aae69d98ea 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -123,14 +123,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using AddContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             AddFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using AddStridedFunctor =
@@ -425,13 +427,15 @@ template <typename argT, typename resT> struct AddInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
     resT,
     AddInplaceFunctor<argT, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using AddInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp
new file mode 100644
index 0000000000..75d9962404
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp
@@ -0,0 +1,45 @@
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace alignment_utils
+{
+
+static constexpr size_t required_alignment = 64;
+
+template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
+{
+    return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
+}
+
+template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;
+
+} // end of namespace alignment_utils
+} // end of namespace kernels
+} // end of namespace tensor
+} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
index 17ef25b0b6..84b7104b47 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -80,13 +80,15 @@ template <typename argT, typename resT> struct AngleFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstire = true>
 using AngleContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            AngleFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstire>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AngleStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 9da5077665..dc5f2c2b18 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -169,9 +169,15 @@ template <typename argT, typename resT> struct AsinFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using AsinContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, AsinFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using AsinContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AsinFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AsinStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index ceab296b91..6d712165a9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -143,13 +143,15 @@ template <typename argT, typename resT> struct AsinhFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using AsinhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            AsinhFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AsinhStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index a9af6d829e..93c9a6696d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AtanFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using AtanContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, AtanFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using AtanContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           AtanFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AtanStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
index 8df1667312..ac8c0483c4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -70,14 +70,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using Atan2ContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             Atan2Functor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using Atan2StridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 3be6abb742..4a26cd92b4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -138,13 +138,15 @@ template <typename argT, typename resT> struct AtanhFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using AtanhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            AtanhFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AtanhStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index b9a0d41f93..e4da56cd9e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -93,14 +93,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     BitwiseAndFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using BitwiseAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -296,14 +298,16 @@ template <typename argT, typename resT> struct BitwiseAndInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseAndInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         BitwiseAndInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using BitwiseAndInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
index 93e715dc4d..cc629594b9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -92,13 +92,15 @@ template <typename argT, typename resT> struct BitwiseInvertFunctor
 template <typename argT,
           typename resT = argT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseInvertContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
                                            resT,
                                            BitwiseInvertFunctor<argT, resT>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using BitwiseInvertStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index b13971e27a..58ef64e16a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -102,14 +102,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     BitwiseLeftShiftFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using BitwiseLeftShiftStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -313,14 +315,16 @@ template <typename argT, typename resT> struct BitwiseLeftShiftInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseLeftShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         BitwiseLeftShiftInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using BitwiseLeftShiftInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
index a07d4ed540..afd24216b2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -92,14 +92,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     BitwiseOrFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using BitwiseOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -292,14 +294,16 @@ template <typename argT, typename resT> struct BitwiseOrInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseOrInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         BitwiseOrInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using BitwiseOrInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index ce8537bc31..f1989b8f64 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -103,14 +103,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     BitwiseRightShiftFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using BitwiseRightShiftStridedFunctor =
@@ -317,14 +319,16 @@ template <typename argT, typename resT> struct BitwiseRightShiftInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseRightShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         BitwiseRightShiftInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using BitwiseRightShiftInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index e2ce5a5703..7b777528c2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -93,14 +93,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     BitwiseXorFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using BitwiseXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -296,14 +298,16 @@ template <typename argT, typename resT> struct BitwiseXorInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using BitwiseXorInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         BitwiseXorInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using BitwiseXorInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index 92584f0dfe..21b8f79b81 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -70,9 +70,15 @@ template <typename argT, typename resT> struct CbrtFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using CbrtContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, CbrtFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using CbrtContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CbrtFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using CbrtStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
index 0059064ec1..3672de4d9c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
@@ -80,9 +80,15 @@ template <typename argT, typename resT> struct CeilFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using CeilContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, CeilFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using CeilContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CeilFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using CeilStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index 5dc4728a65..433bed8614 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -29,6 +29,7 @@
 #include <sycl/sycl.hpp>
 #include <utility>
 
+#include "kernels/elementwise_functions/alignment.hpp"
 #include "utils/offset_utils.hpp"
 
 namespace dpctl
@@ -42,12 +43,18 @@ namespace elementwise_common
 
 namespace py = pybind11;
 
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
 /*! @brief Functor for unary function evaluation on contiguous array */
 template <typename argT,
           typename resT,
           typename UnaryOperatorT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 struct UnaryContigFunctor
 {
 private:
@@ -67,7 +74,8 @@ struct UnaryContigFunctor
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NOTE: vec_sz must divide sg.max_local_range()[0] */
 
-        if constexpr (UnaryOperatorT::is_constant::value) {
+        if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value)
+        {
             // value of operator is known to be a known constant
             constexpr resT const_val = UnaryOperatorT::constant_value;
 
@@ -98,7 +106,8 @@ struct UnaryContigFunctor
                 }
             }
         }
-        else if constexpr (UnaryOperatorT::supports_sg_loadstore::value &&
+        else if constexpr (enable_sg_loadstore &&
+                           UnaryOperatorT::supports_sg_loadstore::value &&
                            UnaryOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
@@ -135,7 +144,8 @@ struct UnaryContigFunctor
                 }
             }
         }
-        else if constexpr (UnaryOperatorT::supports_sg_loadstore::value &&
+        else if constexpr (enable_sg_loadstore &&
+                           UnaryOperatorT::supports_sg_loadstore::value &&
                            std::is_same_v<resT, argT>)
         {
             // default: use scalar-value function
@@ -177,7 +187,9 @@ struct UnaryContigFunctor
                 }
             }
         }
-        else if constexpr (UnaryOperatorT::supports_sg_loadstore::value) {
+        else if constexpr (enable_sg_loadstore &&
+                           UnaryOperatorT::supports_sg_loadstore::value)
+        {
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
@@ -264,7 +276,11 @@ struct UnaryStridedFunctor
 template <typename argTy,
           template <typename T>
           class UnaryOutputType,
-          template <typename A, typename R, unsigned int vs, unsigned int nv>
+          template <typename A,
+                    typename R,
+                    unsigned int vs,
+                    unsigned int nv,
+                    bool enable>
           class ContigFunctorT,
           template <typename A, typename R, unsigned int vs, unsigned int nv>
           class kernel_name,
@@ -289,10 +305,28 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
         const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
         resTy *res_tp = reinterpret_cast<resTy *>(res_p);
 
-        cgh.parallel_for<kernel_name<argTy, resTy, vec_sz, n_vecs>>(
-            sycl::nd_range<1>(gws_range, lws_range),
-            ContigFunctorT<argTy, resTy, vec_sz, n_vecs>(arg_tp, res_tp,
-                                                         nelems));
+        if (is_aligned<required_alignment>(arg_p) &&
+            is_aligned<required_alignment>(res_p))
+        {
+            constexpr bool enable_sg_loadstore = true;
+            using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                               enable_sg_loadstore>(arg_tp, res_tp, nelems));
+        }
+        else {
+            constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                               disable_sg_loadstore>(arg_tp, res_tp, nelems));
+        }
     });
     return comp_ev;
 }
@@ -341,7 +375,8 @@ template <typename argT1,
           typename resT,
           typename BinaryOperatorT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 struct BinaryContigFunctor
 {
 private:
@@ -364,7 +399,8 @@ struct BinaryContigFunctor
         BinaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
 
-        if constexpr (BinaryOperatorT::supports_sg_loadstore::value &&
+        if constexpr (enable_sg_loadstore &&
+                      BinaryOperatorT::supports_sg_loadstore::value &&
                       BinaryOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
@@ -408,7 +444,9 @@ struct BinaryContigFunctor
                 }
             }
         }
-        else if constexpr (BinaryOperatorT::supports_sg_loadstore::value) {
+        else if constexpr (enable_sg_loadstore &&
+                           BinaryOperatorT::supports_sg_loadstore::value)
+        {
             auto sg = ndit.get_sub_group();
             std::uint8_t sgSize = sg.get_local_range()[0];
             std::uint8_t maxsgSize = sg.get_max_local_range()[0];
@@ -714,7 +752,8 @@ template <typename argTy1,
                     typename T2,
                     typename T3,
                     unsigned int vs,
-                    unsigned int nv>
+                    unsigned int nv,
+                    bool enable_sg_loadstore>
           class BinaryContigFunctorT,
           template <typename T1,
                     typename T2,
@@ -751,10 +790,31 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
             reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
         resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
 
-        cgh.parallel_for<kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>>(
-            sycl::nd_range<1>(gws_range, lws_range),
-            BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs>(
-                arg1_tp, arg2_tp, res_tp, nelems));
+        if (is_aligned<required_alignment>(arg1_tp) &&
+            is_aligned<required_alignment>(arg2_tp) &&
+            is_aligned<required_alignment>(res_tp))
+        {
+            constexpr bool enable_sg_loadstore = true;
+            using KernelName =
+                kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
+                                     enable_sg_loadstore>(arg1_tp, arg2_tp,
+                                                          res_tp, nelems));
+        }
+        else {
+            constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName =
+                kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
+                                     disable_sg_loadstore>(arg1_tp, arg2_tp,
+                                                           res_tp, nelems));
+        }
     });
     return comp_ev;
 }
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index c4f893a532..107baecf6c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -29,6 +29,8 @@
 #include <pybind11/pybind11.h>
 #include <sycl/sycl.hpp>
 
+#include "kernels/elementwise_functions/alignment.hpp"
+
 namespace dpctl
 {
 namespace tensor
@@ -38,11 +40,17 @@ namespace kernels
 namespace elementwise_common
 {
 
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
 template <typename argT,
           typename resT,
           typename BinaryInplaceOperatorT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 struct BinaryInplaceContigFunctor
 {
 private:
@@ -63,7 +71,8 @@ struct BinaryInplaceContigFunctor
         BinaryInplaceOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
 
-        if constexpr (BinaryInplaceOperatorT::supports_sg_loadstore::value &&
+        if constexpr (enable_sg_loadstore &&
+                      BinaryInplaceOperatorT::supports_sg_loadstore::value &&
                       BinaryInplaceOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
@@ -103,7 +112,8 @@ struct BinaryInplaceContigFunctor
                 }
             }
         }
-        else if constexpr (BinaryInplaceOperatorT::supports_sg_loadstore::value)
+        else if constexpr (enable_sg_loadstore &&
+                           BinaryInplaceOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
             std::uint8_t sgSize = sg.get_local_range()[0];
@@ -281,7 +291,11 @@ typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)(
 
 template <typename argTy,
           typename resTy,
-          template <typename T1, typename T2, unsigned int vs, unsigned int nv>
+          template <typename T1,
+                    typename T2,
+                    unsigned int vs,
+                    unsigned int nv,
+                    bool enable_sg_loadstore>
           class BinaryInplaceContigFunctorT,
           template <typename T1, typename T2, unsigned int vs, unsigned int nv>
           class kernel_name,
@@ -309,10 +323,29 @@ binary_inplace_contig_impl(sycl::queue &exec_q,
             reinterpret_cast<const argTy *>(rhs_p) + rhs_offset;
         resTy *res_tp = reinterpret_cast<resTy *>(lhs_p) + lhs_offset;
 
-        cgh.parallel_for<kernel_name<argTy, resTy, vec_sz, n_vecs>>(
-            sycl::nd_range<1>(gws_range, lws_range),
-            BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs>(
-                arg_tp, res_tp, nelems));
+        if (is_aligned<required_alignment>(arg_tp) &&
+            is_aligned<required_alignment>(res_tp))
+        {
+            constexpr bool enable_sg_loadstore = true;
+            using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                            enable_sg_loadstore>(arg_tp, res_tp,
+                                                                 nelems));
+        }
+        else {
+            constexpr bool disable_sg_loadstore = true;
+            using InnerKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                            disable_sg_loadstore>(
+                    arg_tp, res_tp, nelems));
+        }
     });
     return comp_ev;
 }
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
index a3c607aa49..b8ebf34a23 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -88,9 +88,15 @@ template <typename argT, typename resT> struct ConjFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using ConjContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, ConjFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using ConjContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ConjFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using ConjStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
index 43e06cb281..00e926d3d8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -83,14 +83,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using CopysignContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             CopysignFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
index c8cd8ef18c..e804d5f3df 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -170,9 +170,15 @@ template <typename argT, typename resT> struct CosFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using CosContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, CosFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using CosContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CosFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using CosStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
index f03d438cbe..16406d5547 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -159,9 +159,15 @@ template <typename argT, typename resT> struct CoshFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using CoshContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, CoshFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using CoshContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           CoshFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using CoshStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index 9f866aa580..c354b612a7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -104,14 +104,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using EqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             EqualFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using EqualStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
index add15c3523..66e84b69bf 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
@@ -128,9 +128,15 @@ template <typename argT, typename resT> struct ExpFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using ExpContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, ExpFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using ExpContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ExpFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using ExpStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
index a22411414b..ae590fc2bf 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -130,9 +130,15 @@ template <typename argT, typename resT> struct Exp2Functor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using Exp2ContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, Exp2Functor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using Exp2ContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           Exp2Functor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using Exp2StridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
index f5204e87b3..8e82031388 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
@@ -137,13 +137,15 @@ template <typename argT, typename resT> struct Expm1Functor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using Expm1ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            Expm1Functor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using Expm1StridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
index 88a20dafe0..a8f810d9ac 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
@@ -80,13 +80,15 @@ template <typename argT, typename resT> struct FloorFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using FloorContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            FloorFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using FloorStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 4ba335c98b..d09f376d04 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -131,14 +131,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     FloorDivideFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -378,14 +380,16 @@ template <typename argT, typename resT> struct FloorDivideInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using FloorDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         FloorDivideInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using FloorDivideInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index e01360efa7..2ba942fb32 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -101,14 +101,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using GreaterContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             GreaterFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using GreaterStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index f017b7f150..48503c608d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -102,14 +102,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     GreaterEqualFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using GreaterEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
index fd19d29c0b..64d8a8f059 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
@@ -85,14 +85,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using HypotContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             HypotFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using HypotStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
index bb1ff2ebcb..03ed7bad78 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
@@ -80,9 +80,15 @@ template <typename argT, typename resT> struct ImagFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using ImagContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, ImagFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using ImagContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ImagFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using ImagStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index cef7c96ed2..b0dab6249c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -100,9 +100,15 @@ template <typename argT, typename resT> struct IsFiniteFunctor
 template <typename argT,
           typename resT = bool,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using IsFiniteContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, IsFiniteFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using IsFiniteContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           IsFiniteFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using IsFiniteStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index 2720385614..8c805b7934 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -98,9 +98,15 @@ template <typename argT, typename resT> struct IsInfFunctor
 template <typename argT,
           typename resT = bool,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using IsInfContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, IsInfFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using IsInfContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           IsInfFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using IsInfStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index 15551e295a..8b10ce2295 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -96,9 +96,15 @@ template <typename argT, typename resT> struct IsNanFunctor
 template <typename argT,
           typename resT = bool,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using IsNanContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, IsNanFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using IsNanContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           IsNanFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using IsNanStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index 02c7a0d95a..1827ca3185 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -100,14 +100,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using LessContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             LessFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using LessStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index f9f6729968..0b6d06fff3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -101,14 +101,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     LessEqualFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using LessEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
index 3668551473..48ec92a257 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
@@ -84,9 +84,15 @@ template <typename argT, typename resT> struct LogFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using LogContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, LogFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using LogContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           LogFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using LogStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
index 5997aded5f..6f3c9d1925 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
@@ -103,13 +103,15 @@ template <typename argT, typename resT> struct Log10Functor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using Log10ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            Log10Functor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using Log10StridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
index 11e3fb3f9f..3b417b46b9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
@@ -104,13 +104,15 @@ template <typename argT, typename resT> struct Log1pFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using Log1pContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            Log1pFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using Log1pStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
index 211a5fdb6c..079c5bf94b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
@@ -103,9 +103,15 @@ template <typename argT, typename resT> struct Log2Functor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using Log2ContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, Log2Functor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using Log2ContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           Log2Functor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using Log2StridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index e918454319..9fb3759779 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -101,14 +101,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     LogAddExpFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
index 988d1ed380..135c264751 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -94,14 +94,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     LogicalAndFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using LogicalAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
index 826af2ee37..8a820c5172 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
@@ -68,13 +68,15 @@ template <typename argT, typename resT> struct LogicalNotFunctor
 template <typename argT,
           typename resT = bool,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using LogicalNotContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
                                            resT,
                                            LogicalNotFunctor<argT, resT>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using LogicalNotStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
index 333951e6b5..a444ced41f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -93,14 +93,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     LogicalOrFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using LogicalOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
index ce4bde9e6b..a3d175f413 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -95,14 +95,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     LogicalXorFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using LogicalXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index 8a1990ba7d..22a63882a9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -98,14 +98,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using MaximumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             MaximumFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using MaximumStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index fb3490ee19..a11a36aeee 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -98,14 +98,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using MinimumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             MinimumFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using MinimumStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index 47549b5a74..7aeea6fdc7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -103,14 +103,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using MultiplyContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             MultiplyFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -414,14 +416,16 @@ template <typename argT, typename resT> struct MultiplyInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using MultiplyInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         MultiplyInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using MultiplyInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index bc28aafad7..b67e74438f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -70,9 +70,15 @@ template <typename argT, typename resT> struct NegativeFunctor
 template <typename argT,
           typename resT = argT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using NegativeContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, NegativeFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using NegativeContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           NegativeFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename T> struct NegativeOutputType
 {
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index faeab82580..a5bc3a6cc6 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -100,14 +100,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using NotEqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             NotEqualFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using NotEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index b3e109c76c..6136a55bce 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -85,9 +85,15 @@ template <typename argT, typename resT> struct PositiveFunctor
 template <typename argT,
           typename resT = argT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using PositiveContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, PositiveFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using PositiveContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           PositiveFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename T> struct PositiveOutputType
 {
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index 9068e67f10..65214c9533 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -157,14 +157,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using PowContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             PowFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using PowStridedFunctor =
@@ -427,13 +429,15 @@ template <typename argT, typename resT> struct PowInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
     resT,
     PowInplaceFunctor<argT, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using PowInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
index 634c29b5ed..2fe1d70bd5 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -93,9 +93,15 @@ template <typename argT, typename resT> struct ProjFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using ProjContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, ProjFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using ProjContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           ProjFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using ProjStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
index 6a7580d548..94cdaf1496 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
@@ -80,9 +80,15 @@ template <typename argT, typename resT> struct RealFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using RealContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, RealFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using RealContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           RealFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using RealStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
index c775576867..ecc5959fc3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
@@ -87,13 +87,15 @@ template <typename argT, typename resT> struct ReciprocalFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using ReciprocalContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            ReciprocalFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using ReciprocalStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index 9e64e6500f..b25c7be91f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -148,14 +148,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using RemainderContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     RemainderFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using RemainderStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -402,14 +404,16 @@ template <typename argT, typename resT> struct RemainderInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using RemainderInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         RemainderInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using RemainderInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index 547d31b392..e8221e4c25 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -89,13 +89,15 @@ template <typename argT, typename resT> struct RoundFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using RoundContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            RoundFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using RoundStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index d9e0c33081..a7f70337f8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -73,13 +73,15 @@ template <typename argT, typename resT> struct RsqrtFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using RsqrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            RsqrtFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using RsqrtStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index 521e935c16..804b6260f1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -109,9 +109,15 @@ template <typename argT, typename resT> struct SignFunctor
 template <typename argT,
           typename resT = argT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using SignContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, SignFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using SignContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           SignFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename T> struct SignOutputType
 {
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index 3e961c466d..401db90b63 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -79,9 +79,15 @@ template <typename argT, typename resT> struct SignbitFunctor
 template <typename argT,
           typename resT = bool,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using SignbitContigFunctor = elementwise_common::
-    UnaryContigFunctor<argT, resT, SignbitFunctor<argT, resT>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using SignbitContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           SignbitFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using SignbitStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index 97768fc8e9..96c216c475 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -186,9 +186,15 @@ template <typename argT, typename resT> struct SinFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using SinContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, SinFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using SinContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           SinFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using SinStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index fe7dae533b..cd7f998afa 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -161,9 +161,15 @@ template <typename argT, typename resT> struct SinhFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using SinhContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, SinhFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using SinhContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           SinhFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using SinhStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index 2d423502a7..3eea801fb2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -268,9 +268,15 @@ template <typename argT, typename resT> struct SqrtFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using SqrtContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, SqrtFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using SqrtContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           SqrtFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using SqrtStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
index c888c4eefa..656ec47c4b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
@@ -103,13 +103,15 @@ template <typename argT, typename resT> struct SquareFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using SquareContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            SquareFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using SquareStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index 9447873dec..a24e71e9d2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -86,14 +86,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using SubtractContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
                                             argT2,
                                             resT,
                                             SubtractFunctor<argT1, argT2, resT>,
                                             vec_sz,
-                                            n_vecs>;
+                                            n_vecs,
+                                            enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -411,14 +413,16 @@ template <typename argT, typename resT> struct SubtractInplaceFunctor
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using SubtractInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         SubtractInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using SubtractInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
index 2c648edd7b..e2d08cba0d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -136,9 +136,15 @@ template <typename argT, typename resT> struct TanFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using TanContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, TanFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using TanContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           TanFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using TanStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
index 84ba7a9a3f..13ea6c7eee 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -131,9 +131,15 @@ template <typename argT, typename resT> struct TanhFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
-using TanhContigFunctor = elementwise_common::
-    UnaryContigFunctor<argTy, resTy, TanhFunctor<argTy, resTy>, vec_sz, n_vecs>;
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+using TanhContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           TanhFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using TanhStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index c72634d106..e063ecef54 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -125,14 +125,16 @@ template <typename argT1,
           typename argT2,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
     argT2,
     resT,
     TrueDivideFunctor<argT1, argT2, resT>,
     vec_sz,
-    n_vecs>;
+    n_vecs,
+    enable_sg_loadstore>;
 
 template <typename argT1, typename argT2, typename resT, typename IndexerT>
 using TrueDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
@@ -509,14 +511,16 @@ struct TrueDivideInplaceTypeMapFactory
 template <typename argT,
           typename resT,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using TrueDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
         argT,
         resT,
         TrueDivideInplaceFunctor<argT, resT>,
         vec_sz,
-        n_vecs>;
+        n_vecs,
+        enable_sg_loadstore>;
 
 template <typename argT, typename resT, typename IndexerT>
 using TrueDivideInplaceStridedFunctor =
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
index 0e08d966e9..35b0783719 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -77,13 +77,15 @@ template <typename argT, typename resT> struct TruncFunctor
 template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 using TruncContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            TruncFunctor<argTy, resTy>,
                                            vec_sz,
-                                           n_vecs>;
+                                           n_vecs,
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using TruncStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index 0410229a0a..35d7545f2d 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -33,6 +33,7 @@
 #include <vector>
 
 #include "elementwise_functions_type_utils.hpp"
+#include "kernels/elementwise_functions/alignment.hpp"
 #include "simplify_iteration_space.hpp"
 #include "utils/memory_overlap.hpp"
 #include "utils/offset_utils.hpp"
@@ -48,6 +49,9 @@ namespace tensor
 namespace py_internal
 {
 
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
 /*! @brief Template implementing Python API for unary elementwise functions */
 template <typename output_typesT,
           typename contig_dispatchT,
@@ -443,8 +447,6 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
     int nd = dst_nd;
     const py::ssize_t *shape = src1_shape;
 
-    // all args except itemsizes and is_?_contig bools can be modified by
-    // reference
     dpctl::tensor::py_internal::simplify_iteration_space_3(
         nd, shape, src1_strides, src2_strides, dst_strides,
         // outputs
@@ -487,16 +489,29 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                     contig_matrix_row_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
                 if (matrix_row_broadcast_fn != nullptr) {
-                    size_t n0 = simplified_shape[0];
-                    size_t n1 = simplified_shape[1];
-                    sycl::event comp_ev = matrix_row_broadcast_fn(
-                        exec_q, host_tasks, n0, n1, src1_data, src1_offset,
-                        src2_data, src2_offset, dst_data, dst_offset, depends);
-
-                    return std::make_pair(
-                        dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst},
-                                                      host_tasks),
-                        comp_ev);
+                    int src1_itemsize = src1.get_elemsize();
+                    int src2_itemsize = src2.get_elemsize();
+                    int dst_itemsize = dst.get_elemsize();
+
+                    if (is_aligned<required_alignment>(
+                            src1_data + src1_offset * src1_itemsize) &&
+                        is_aligned<required_alignment>(
+                            src2_data + src2_offset * src2_itemsize) &&
+                        is_aligned<required_alignment>(
+                            dst_data + dst_offset * dst_itemsize))
+                    {
+                        size_t n0 = simplified_shape[0];
+                        size_t n1 = simplified_shape[1];
+                        sycl::event comp_ev = matrix_row_broadcast_fn(
+                            exec_q, host_tasks, n0, n1, src1_data, src1_offset,
+                            src2_data, src2_offset, dst_data, dst_offset,
+                            depends);
+
+                        return std::make_pair(
+                            dpctl::utils::keep_args_alive(
+                                exec_q, {src1, src2, dst}, host_tasks),
+                            comp_ev);
+                    }
                 }
             }
             if (isEqual(simplified_src1_strides, one_zero_strides) &&
@@ -507,16 +522,30 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                     contig_row_matrix_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
                 if (row_matrix_broadcast_fn != nullptr) {
-                    size_t n0 = simplified_shape[1];
-                    size_t n1 = simplified_shape[0];
-                    sycl::event comp_ev = row_matrix_broadcast_fn(
-                        exec_q, host_tasks, n0, n1, src1_data, src1_offset,
-                        src2_data, src2_offset, dst_data, dst_offset, depends);
 
-                    return std::make_pair(
-                        dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst},
-                                                      host_tasks),
-                        comp_ev);
+                    int src1_itemsize = src1.get_elemsize();
+                    int src2_itemsize = src2.get_elemsize();
+                    int dst_itemsize = dst.get_elemsize();
+
+                    if (is_aligned<required_alignment>(
+                            src1_data + src1_offset * src1_itemsize) &&
+                        is_aligned<required_alignment>(
+                            src2_data + src2_offset * src2_itemsize) &&
+                        is_aligned<required_alignment>(
+                            dst_data + dst_offset * dst_itemsize))
+                    {
+                        size_t n0 = simplified_shape[1];
+                        size_t n1 = simplified_shape[0];
+                        sycl::event comp_ev = row_matrix_broadcast_fn(
+                            exec_q, host_tasks, n0, n1, src1_data, src1_offset,
+                            src2_data, src2_offset, dst_data, dst_offset,
+                            depends);
+
+                        return std::make_pair(
+                            dpctl::utils::keep_args_alive(
+                                exec_q, {src1, src2, dst}, host_tasks),
+                            comp_ev);
+                    }
                 }
             }
         }
@@ -736,8 +765,6 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs,
     int nd = lhs_nd;
     const py::ssize_t *shape = rhs_shape;
 
-    // all args except itemsizes and is_?_contig bools can be modified by
-    // reference
     dpctl::tensor::py_internal::simplify_iteration_space(
         nd, shape, rhs_strides, lhs_strides,
         // outputs

From 786bec7abfa7b035b22a94b18ddeddd7e45daf47 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 8 Dec 2023 05:59:39 -0600
Subject: [PATCH 2/4] Added tests for abs/not_equal with not-aligned start of
 array

---
 dpctl/tests/elementwise/test_abs.py       | 15 +++++++++++++++
 dpctl/tests/elementwise/test_not_equal.py | 18 ++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py
index 1db17c1b80..ca296911a0 100644
--- a/dpctl/tests/elementwise/test_abs.py
+++ b/dpctl/tests/elementwise/test_abs.py
@@ -187,3 +187,18 @@ def test_abs_complex_fp_special_values(dtype):
     tol = dpt.finfo(r.dtype).resolution
 
     assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_abs_alignment(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.ones(512, dtype=dtype)
+    r = dpt.abs(x)
+
+    r2 = dpt.abs(x[1:])
+    assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2))
+
+    dpt.abs(x[:-1], out=r[1:])
+    assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2))
diff --git a/dpctl/tests/elementwise/test_not_equal.py b/dpctl/tests/elementwise/test_not_equal.py
index 6baef3f145..222faccc39 100644
--- a/dpctl/tests/elementwise/test_not_equal.py
+++ b/dpctl/tests/elementwise/test_not_equal.py
@@ -188,3 +188,21 @@ def __sycl_usm_array_interface__(self):
     c = Canary()
     with pytest.raises(ValueError):
         dpt.not_equal(a, c)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_not_equal_alignment(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 256
+    s = dpt.concat((dpt.zeros(n, dtype=dtype), dpt.zeros(n, dtype=dtype)))
+
+    mask = s[:-1] != s[1:]
+    (pos,) = dpt.nonzero(mask)
+    assert dpt.all(pos == n)
+
+    out_arr = dpt.zeros(2 * n, dtype=mask.dtype)
+    dpt.not_equal(s[:-1], s[1:], out=out_arr[1:])
+    (pos,) = dpt.nonzero(mask)
+    assert dpt.all(pos == (n + 1))

From a307c008529a3802063ce67b1be5d71741661cdd Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Fri, 8 Dec 2023 15:48:15 -0800
Subject: [PATCH 3/4] Adds sub-group work-around to copy, clip, where

Moves alignment.hpp into dpctl/tensor/libtensor/include/kernels

Fixes a small typo in angle.hpp
---
 .../{elementwise_functions => }/alignment.hpp |  0
 .../tensor/libtensor/include/kernels/clip.hpp | 42 +++++++++++++---
 .../include/kernels/copy_and_cast.hpp         | 43 +++++++++++++---
 .../kernels/elementwise_functions/angle.hpp   |  4 +-
 .../kernels/elementwise_functions/common.hpp  |  2 +-
 .../elementwise_functions/common_inplace.hpp  |  2 +-
 .../libtensor/include/kernels/where.hpp       | 50 ++++++++++++++++---
 .../elementwise_functions.hpp                 |  2 +-
 8 files changed, 121 insertions(+), 24 deletions(-)
 rename dpctl/tensor/libtensor/include/kernels/{elementwise_functions => }/alignment.hpp (100%)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
similarity index 100%
rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/alignment.hpp
rename to dpctl/tensor/libtensor/include/kernels/alignment.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp
index 9cca9f615b..aff1acb071 100644
--- a/dpctl/tensor/libtensor/include/kernels/clip.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -32,6 +32,7 @@
 #include <pybind11/pybind11.h>
 #include <type_traits>
 
+#include "kernels/alignment.hpp"
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch.hpp"
@@ -51,6 +52,11 @@ namespace td_ns = dpctl::tensor::type_dispatch;
 
 using namespace dpctl::tensor::offset_utils;
 
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
 template <typename T> T clip(const T &x, const T &min, const T &max)
 {
     using dpctl::tensor::type_utils::is_complex;
@@ -73,7 +79,11 @@ template <typename T> T clip(const T &x, const T &min, const T &max)
     }
 }
 
-template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
+template <typename T,
+          int vec_sz = 4,
+          int n_vecs = 2,
+          bool enable_sg_loadstore = true>
+class ClipContigFunctor
 {
 private:
     size_t nelems = 0;
@@ -96,7 +106,7 @@ template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
     void operator()(sycl::nd_item<1> ndit) const
     {
         using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<T>::value) {
+        if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
             std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
             size_t base = ndit.get_global_linear_id();
 
@@ -195,10 +205,30 @@ sycl::event clip_contig_impl(sycl::queue &q,
         const auto gws_range = sycl::range<1>(n_groups * lws);
         const auto lws_range = sycl::range<1>(lws);
 
-        cgh.parallel_for<clip_contig_kernel<T, vec_sz, n_vecs>>(
-            sycl::nd_range<1>(gws_range, lws_range),
-            ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
-                                                 dst_tp));
+        if (is_aligned<required_alignment>(x_cp) &&
+            is_aligned<required_alignment>(min_cp) &&
+            is_aligned<required_alignment>(max_cp) &&
+            is_aligned<required_alignment>(dst_cp))
+        {
+            constexpr bool enable_sg_loadstore = true;
+            using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>(
+                    nelems, x_tp, min_tp, max_tp, dst_tp));
+        }
+        else {
+            constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>(
+                    nelems, x_tp, min_tp, max_tp, dst_tp));
+        }
     });
 
     return clip_ev;
diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
index 9d1c788626..ef24b58ef2 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -29,6 +29,7 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "kernels/alignment.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_utils.hpp"
 
@@ -44,6 +45,11 @@ namespace copy_and_cast
 namespace py = pybind11;
 using namespace dpctl::tensor::offset_utils;
 
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
 template <typename srcT, typename dstT, typename IndexerT>
 class copy_cast_generic_kernel;
 
@@ -200,7 +206,8 @@ template <typename srcT,
           typename dstT,
           typename CastFnT,
           int vec_sz = 4,
-          int n_vecs = 2>
+          int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 class ContigCopyFunctor
 {
 private:
@@ -219,7 +226,9 @@ class ContigCopyFunctor
         CastFnT fn{};
 
         using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<srcT>::value || is_complex<dstT>::value) {
+        if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
+                      is_complex<dstT>::value)
+        {
             std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
             size_t base = ndit.get_global_linear_id();
 
@@ -326,10 +335,32 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
         const auto gws_range = sycl::range<1>(n_groups * lws);
         const auto lws_range = sycl::range<1>(lws);
 
-        cgh.parallel_for<copy_cast_contig_kernel<srcTy, dstTy, n_vecs, vec_sz>>(
-            sycl::nd_range<1>(gws_range, lws_range),
-            ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
-                              n_vecs>(nelems, src_tp, dst_tp));
+        if (is_aligned<required_alignment>(src_cp) &&
+            is_aligned<required_alignment>(dst_cp))
+        {
+            constexpr bool enable_sg_loadstore = true;
+            using KernelName =
+                copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
+                                  n_vecs, enable_sg_loadstore>(nelems, src_tp,
+                                                               dst_tp));
+        }
+        else {
+            constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName =
+                copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
+                                  n_vecs, disable_sg_loadstore>(nelems, src_tp,
+                                                                dst_tp));
+        }
     });
 
     return copy_and_cast_ev;
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
index 84b7104b47..2759974b93 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -81,14 +81,14 @@ template <typename argTy,
           typename resTy = argTy,
           unsigned int vec_sz = 4,
           unsigned int n_vecs = 2,
-          bool enable_sg_loadstire = true>
+          bool enable_sg_loadstore = true>
 using AngleContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
                                            resTy,
                                            AngleFunctor<argTy, resTy>,
                                            vec_sz,
                                            n_vecs,
-                                           enable_sg_loadstire>;
+                                           enable_sg_loadstore>;
 
 template <typename argTy, typename resTy, typename IndexerT>
 using AngleStridedFunctor = elementwise_common::
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index 433bed8614..1794dbf721 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -29,7 +29,7 @@
 #include <sycl/sycl.hpp>
 #include <utility>
 
-#include "kernels/elementwise_functions/alignment.hpp"
+#include "kernels/alignment.hpp"
 #include "utils/offset_utils.hpp"
 
 namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index 107baecf6c..deaef5522f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -29,7 +29,7 @@
 #include <pybind11/pybind11.h>
 #include <sycl/sycl.hpp>
 
-#include "kernels/elementwise_functions/alignment.hpp"
+#include "kernels/alignment.hpp"
 
 namespace dpctl
 {
diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp
index 9558603d5e..a1c0c7cfb0 100644
--- a/dpctl/tensor/libtensor/include/kernels/where.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/where.hpp
@@ -25,8 +25,6 @@
 #pragma once
 #include "pybind11/numpy.h"
 #include "pybind11/stl.h"
-#include "utils/offset_utils.hpp"
-#include "utils/type_utils.hpp"
 #include <algorithm>
 #include <complex>
 #include <cstdint>
@@ -34,6 +32,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "kernels/alignment.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_utils.hpp"
+
 namespace dpctl
 {
 namespace tensor
@@ -47,12 +49,21 @@ namespace py = pybind11;
 
 using namespace dpctl::tensor::offset_utils;
 
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
 template <typename T, typename condT, typename IndexerT>
 class where_strided_kernel;
 template <typename T, typename condT, int vec_sz, int n_vecs>
 class where_contig_kernel;
 
-template <typename T, typename condT, int vec_sz = 4, int n_vecs = 2>
+template <typename T,
+          typename condT,
+          int vec_sz = 4,
+          int n_vecs = 2,
+          bool enable_sg_loadstore = true>
 class WhereContigFunctor
 {
 private:
@@ -76,7 +87,9 @@ class WhereContigFunctor
     void operator()(sycl::nd_item<1> ndit) const
     {
         using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<condT>::value || is_complex<T>::value) {
+        if constexpr (!enable_sg_loadstore || is_complex<condT>::value ||
+                      is_complex<T>::value)
+        {
             std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
             size_t base = ndit.get_global_linear_id();
 
@@ -175,10 +188,33 @@ sycl::event where_contig_impl(sycl::queue &q,
         const auto gws_range = sycl::range<1>(n_groups * lws);
         const auto lws_range = sycl::range<1>(lws);
 
-        cgh.parallel_for<where_contig_kernel<T, condT, vec_sz, n_vecs>>(
-            sycl::nd_range<1>(gws_range, lws_range),
-            WhereContigFunctor<T, condT, vec_sz, n_vecs>(nelems, cond_tp, x1_tp,
+        if (is_aligned<required_alignment>(cond_cp) &&
+            is_aligned<required_alignment>(x1_cp) &&
+            is_aligned<required_alignment>(x2_cp) &&
+            is_aligned<required_alignment>(dst_cp))
+        {
+            constexpr bool enable_sg_loadstore = true;
+            using KernelName = where_contig_kernel<T, condT, vec_sz, n_vecs>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                WhereContigFunctor<T, condT, vec_sz, n_vecs,
+                                   enable_sg_loadstore>(nelems, cond_tp, x1_tp,
+                                                        x2_tp, dst_tp));
+        }
+        else {
+            constexpr bool disable_sg_loadstore = false;
+            using InnerKernelName =
+                where_contig_kernel<T, condT, vec_sz, n_vecs>;
+            using KernelName =
+                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+
+            cgh.parallel_for<KernelName>(
+                sycl::nd_range<1>(gws_range, lws_range),
+                WhereContigFunctor<T, condT, vec_sz, n_vecs,
+                                   disable_sg_loadstore>(nelems, cond_tp, x1_tp,
                                                          x2_tp, dst_tp));
+        }
     });
 
     return where_ev;
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index 35d7545f2d..da0137fd5f 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -33,7 +33,7 @@
 #include <vector>
 
 #include "elementwise_functions_type_utils.hpp"
-#include "kernels/elementwise_functions/alignment.hpp"
+#include "kernels/alignment.hpp"
 #include "simplify_iteration_space.hpp"
 #include "utils/memory_overlap.hpp"
 #include "utils/offset_utils.hpp"

From 86153f3a1d9ed32c63f72f3baa862304182baab5 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Fri, 8 Dec 2023 15:48:30 -0800
Subject: [PATCH 4/4] Adds tests for unaligned clip, where, copy

---
 dpctl/tests/test_tensor_clip.py                  | 11 +++++++++++
 dpctl/tests/test_usm_ndarray_ctor.py             |  9 +++++++++
 dpctl/tests/test_usm_ndarray_search_functions.py | 11 +++++++++++
 3 files changed, 31 insertions(+)

diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py
index 7050b17e7c..39ba35a4a1 100644
--- a/dpctl/tests/test_tensor_clip.py
+++ b/dpctl/tests/test_tensor_clip.py
@@ -625,3 +625,14 @@ def test_clip_max_weak_types():
 
     with pytest.raises(ValueError):
         dpt.clip(x, 2.5, m)
+
+
+def test_clip_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.full(513, 5, dtype="i4")
+    a_min = dpt.zeros(512, dtype="i4")
+    a_max = dpt.full(512, 2, dtype="i4")
+
+    expected = dpt.full(512, 2, dtype="i4")
+    assert dpt.all(dpt.clip(x[1:], a_min, a_max) == expected)
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index 5e3feb660d..7227e687af 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -1326,6 +1326,15 @@ def test_copy():
     assert np.array_equal(dpt.asnumpy(Yk), ref)
 
 
+def test_copy_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.ones(513, dtype="i4")
+    r = dpt.astype(x[1:], "f4")
+
+    assert dpt.all(r == 1)
+
+
 def test_ctor_invalid():
     try:
         m = dpm.MemoryUSMShared(12)
diff --git a/dpctl/tests/test_usm_ndarray_search_functions.py b/dpctl/tests/test_usm_ndarray_search_functions.py
index 74bd056729..dc62dd1951 100644
--- a/dpctl/tests/test_usm_ndarray_search_functions.py
+++ b/dpctl/tests/test_usm_ndarray_search_functions.py
@@ -412,3 +412,14 @@ def test_where_order():
         condition = dpt.zeros(n, dtype="?", order="C")
         res = dpt.where(condition, ar1, ar2)
         assert res.strides == (20, 1)
+
+
+def test_where_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.ones(513, dtype="i4")
+    a = dpt.full(512, 2, dtype="i4")
+    b = dpt.zeros(512, dtype="i4")
+
+    expected = dpt.full(512, 2, dtype="i4")
+    assert dpt.all(dpt.where(x[1:], a, b) == expected)