diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index f1435f0ccc..b6a4649a30 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -102,6 +102,8 @@ jobs:
         run: |
           echo "WHEELS_OUTPUT_FOLDER=$GITHUB_WORKSPACE${{ runner.os == 'Linux' && '/' || '\\' }}" >> $GITHUB_ENV
       - name: Build conda package
+        env:
+          OVERRIDE_INTEL_IPO: 1   # IPO requires more resources than the GH Actions VM provides
         run: conda build --no-test --python ${{ matrix.python }} -c intel -c conda-forge --override-channels conda-recipe
       - name: Upload artifact
         uses: actions/upload-artifact@v3
@@ -181,6 +183,7 @@ jobs:
           python -c "import dpctl; dpctl.lsplatform(verbosity=2)"
       - name: Install gdb
         run: |
+          sudo apt-get update --fix-missing
           sudo apt-get install -y gdb
       - name: Run test_elementwise under gdb
         run: |
@@ -320,15 +323,22 @@ jobs:
       matrix:
         python: ['3.9', '3.10', '3.11']
     steps:
-      - name: Download artifact
+      - name: Download conda artifact
         uses: actions/download-artifact@v3
         with:
          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
+      - name: Download wheel artifact
+        uses: actions/download-artifact@v3
+        with:
+          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
+
       - name: Install anaconda-client
         run: conda install anaconda-client
       - name: Add conda to system path
         run: echo $CONDA/bin >> $GITHUB_PATH
+      - name: Package version
+        run: echo "PACKAGE_VERSION=$(basename ${{ env.PACKAGE_NAME }}-*.tar.bz2 | sed 's/^${{ env.PACKAGE_NAME }}-\([^-]*\).*/\1/')" >> $GITHUB_ENV

       - name: Upload
         env:
@@ -339,7 +349,7 @@ jobs:
       - name: Upload Wheels
         env:
           ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
-        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl
+        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl --version ${{ env.PACKAGE_VERSION }}

   upload_windows:
     needs: test_windows
@@ -353,13 +363,24 @@ jobs:
         uses: actions/download-artifact@v3
         with:
           name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
+
+      - name: Download wheel artifact
+        uses: actions/download-artifact@v3
+        with:
+          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
+
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-activate-base: true
           activate-environment: ""
+
       - name: Install anaconda-client
         run: conda install anaconda-client
+      - name: Package version
+        shell: bash -el {0}
+        run: echo "PACKAGE_VERSION=$(basename ${{ env.PACKAGE_NAME }}-*.tar.bz2 | sed 's/^${{ env.PACKAGE_NAME }}-\([^-]*\).*/\1/')" >> $GITHUB_ENV
+
       - name: Upload
         env:
           ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
@@ -369,7 +390,37 @@ jobs:
       - name: Upload Wheels
         env:
           ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
-        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl
+        run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl --version ${{ env.PACKAGE_VERSION }}
+
+  cleanup_packages:
+    name: Clean up anaconda packages
+    needs: [upload_linux, upload_windows]
+    runs-on: 'ubuntu-latest'
+    defaults:
+      run:
+        shell: bash -el {0}
+    steps:
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          run-post: false
+          channel-priority: "disabled"
+          channels: conda-forge
+          python-version: '3.11'
+
+      - name: Install anaconda-client
+        run: conda install anaconda-client
+
+      - name: Checkout repo
+        uses: actions/checkout@v2
+        with:
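+          # checks out IntelPython/devops-tools (a separate repository) so the
+          # cleanup step below can run its scripts/cleanup-old-packages.py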
+          repository: IntelPython/devops-tools
+          fetch-depth: 0
+
+      - name: Cleanup old packages
+        run: |
+          python scripts/cleanup-old-packages.py \
+          --verbose --force --token ${{ secrets.ANACONDA_TOKEN }} \
+          --package dppy/${{ env.PACKAGE_NAME }} --label dev

   test_examples_linux:
     needs: build_linux
@@ -615,7 +666,7 @@ jobs:
           python -c "import dpctl; dpctl.lsplatform()"
           export ARRAY_API_TESTS_MODULE=dpctl.tensor
           cd /home/runner/work/array-api-tests
-          pytest --ci --json-report --json-report-file=$FILE array_api_tests/ || true
+          pytest --json-report --json-report-file=$FILE array_api_tests/ || true
       - name: Set Github environment variables
         shell: bash -l {0}
         run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b91813feda..eb53db12ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR)

 project(dpctl
+    VERSION 0.15
     LANGUAGES CXX
     DESCRIPTION "Python interface for XPU programming"
 )
@@ -17,7 +18,7 @@ option(DPCTL_GENERATE_COVERAGE
     OFF
 )

-find_package(IntelDPCPP REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
+find_package(IntelSYCL REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)

 add_subdirectory(libsyclinterface)

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index d72914b30b..0000000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,23 +0,0 @@
-recursive-include dpctl/include *.h
-recursive-include dpctl/include *.hpp
-include dpctl/include/dpctl4pybind11.hpp
-recursive-include dpctl *.pxd
-recursive-include dpctl *.cmake
-include dpctl/_sycl_context.h
-include dpctl/_sycl_context_api.h
-include dpctl/_sycl_device.h
-include dpctl/_sycl_device_api.h
-include dpctl/_sycl_queue.h
-include dpctl/_sycl_queue_api.h
-include dpctl/_sycl_event.h
-include dpctl/_sycl_event_api.h
-include dpctl/memory/_memory.h
-include dpctl/memory/_memory_api.h
-include dpctl/program/_program.h
-include dpctl/program/_program_api.h
-include dpctl/tensor/_usmarray.h
-include dpctl/tensor/_usmarray_api.h
-recursive-include dpctl/tensor/include *
-recursive-include dpctl/tensor/libtensor/include *
-include dpctl/tests/input_files/*
-include dpctl/tests/*.pyx
diff --git a/cmake/IntelDPCPPConfig.cmake b/cmake/IntelSYCLConfig.cmake
old mode 100644
new mode 100755
similarity index 64%
rename from cmake/IntelDPCPPConfig.cmake
rename to cmake/IntelSYCLConfig.cmake
index 37d79a3ec1..c51e47290c
--- a/cmake/IntelDPCPPConfig.cmake
+++ b/cmake/IntelSYCLConfig.cmake
@@ -1,5 +1,5 @@
 #
-# Modifications, Copyright (C) 2021 Intel Corporation
+# Modifications, Copyright (C) 2022 Intel Corporation
 #
 # This software and the related documents are Intel copyrighted materials, and
 # your use of them is governed by the express license under which they were
@@ -15,10 +15,10 @@
 # file Copyright.txt or https://cmake.org/licensing for details.

 #[=======================================================================[.rst:
-IntelDPCPPConfig
+IntelSYCLConfig
 -------

-DPCPP Library to verify DPCPP/SYCL compatability of CMAKE_CXX_COMPILER
+Library to verify SYCL compatibility of CMAKE_CXX_COMPILER
 and passes relevant compiler flags.

 Result Variables
 ^^^^^^^^^^^^^^^^

 This will define the following variables:

-``IntelDPCPP_FOUND``
-  True if the system has the DPCPP library.
+``IntelSYCL_FOUND``
+  True if the system has the SYCL library.
 ``SYCL_LANGUAGE_VERSION``
   The SYCL language spec version by Compiler.
 ``SYCL_INCLUDE_DIR``
@@ -37,35 +37,39 @@ This will define the following variables:
 ``SYCL_FLAGS``
   SYCL specific flags for the compiler.
+``IntelSYCL::SYCL_CXX``
+  Target for using Intel SYCL (DPC++). The following properties are defined
+  for the target: ``INTERFACE_COMPILE_OPTIONS``, ``INTERFACE_LINK_OPTIONS``,
+  ``INTERFACE_INCLUDE_DIRECTORIES``, and ``INTERFACE_LINK_DIRECTORIES``
+
 Cache Variables
 ^^^^^^^^^^^^^^^

-The following cache variables may also be set:
+The following cache variable may also be set:

-``SYCL_INCLUDE_DIR``
-  The directory containing ``sycl.hpp``.
-``SYCL_LIBRARY_DIR``
-  The path to the SYCL library.
-``SYCL_FLAGS``
-  SYCL specific flags for the compiler.
 ``SYCL_LANGUAGE_VERSION``
   The SYCL language spec version by Compiler.

-.. note::
+.. Note::

-  For now, user needs to set -DCMAKE_CXX_COMPILER or environment of
+  1. User needs to set -DCMAKE_CXX_COMPILER or environment of
   CXX pointing to SYCL compatible compiler  ( eg: icx, clang++, icpx)

-  Note: do not set to DPCPP compiler. If set to a Compiler family
-  that supports dpcpp ( eg: IntelLLVM) both DPCPP and SYCL
-  features are enabled.
-  And add this package to user's Cmake config file.
+
+  2. Add this package to user's CMake config file.
+
+  .. code-block:: cmake
+
+     find_package(IntelSYCL REQUIRED)
+
+  3. Add sources to target through add_sycl_to_target()

   .. code-block:: cmake

-     find_package(IntelDPCPP REQUIRED)
+     # Compile specific sources for SYCL and build target for SYCL
+     add_executable(target_proj A.cpp B.cpp offload1.cpp offload2.cpp)
+     add_sycl_to_target(TARGET target_proj SOURCES offload1.cpp offload2.cpp)

 #]=======================================================================]

@@ -83,25 +87,33 @@ endif()
 string(COMPARE EQUAL "${CMAKE_CXX_COMPILER}" "" nocmplr)
 if(nocmplr)
-  set(IntelDPCPP_FOUND False)
+  set(IntelSYCL_FOUND False)
   set(SYCL_REASON_FAILURE "SYCL: CMAKE_CXX_COMPILER not set!!")
-  set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+  set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+endif()
+
+# Check if a Compiler ID is being set. project() should be called prior to find_package()
+
+if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "x")
+  set(IntelSYCL_FOUND False)
+  set(SYCL_REASON_FAILURE "CMake CXX Compiler family is not set. Please make sure find_package(IntelSYCL) is called after project()!!")
+  set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+  return()
 endif()

 # Check for known compiler family that supports SYCL

 if( NOT "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" AND
     NOT "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM")
-   set(IntelDPCPP_FOUND False)
+   set(IntelSYCL_FOUND False)
    set(SYCL_REASON_FAILURE "Unsupported compiler family ${CMAKE_CXX_COMPILER_ID} and compiler ${CMAKE_CXX_COMPILER}!!")
-   set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
+   set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
    return()
 endif()

 # Assume that CXX Compiler supports SYCL and then test to verify.
 set(SYCL_COMPILER ${CMAKE_CXX_COMPILER})

-
 # Function to write a test case to verify SYCL features.

 function(SYCL_FEATURE_TEST_WRITE src)
@@ -144,7 +156,7 @@ function(SYCL_FEATURE_TEST_BUILD TEST_SRC_FILE TEST_EXE)
     OUTPUT_VARIABLE output ERROR_VARIABLE output
     OUTPUT_FILE ${SYCL_TEST_DIR}/Compile.log
     RESULT_VARIABLE result
-    TIMEOUT 20
+    TIMEOUT 60
     )

   # Verify if test case build properly.
@@ -168,12 +180,12 @@ function(SYCL_FEATURE_TEST_RUN TEST_EXE)
     WORKING_DIRECTORY ${SYCL_TEST_DIR}
     OUTPUT_VARIABLE output ERROR_VARIABLE output
     RESULT_VARIABLE result
-    TIMEOUT 20
+    TIMEOUT 60
     )

   # Verify the test execution output.
if(test_result) - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "SYCL: feature test execution failed!!") endif() # TODO: what iff the result is false.. error or ignore? @@ -236,14 +248,14 @@ set(SYCL_LINK_FLAGS "") # Based on Compiler ID, add support for SYCL if( "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" OR "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM") - set(SYCL_FLAGS "-fsycl ") - set(SYCL_LINK_FLAGS "-fsycl ") + list(APPEND SYCL_FLAGS "-fsycl") + list(APPEND SYCL_LINK_FLAGS "-fsycl") endif() # TODO verify if this is needed # Windows: Add Exception handling if(WIN32) - set(SYCL_FLAGS "${SYCL_FLAGS} /EHsc") + list(APPEND SYCL_FLAGS "/EHsc") endif() set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") @@ -273,32 +285,76 @@ SYCL_FEATURE_TEST_EXTRACT(${test_output}) # define macro SYCL_LANGUAGE_VERSION string(COMPARE EQUAL "${SYCL_LANGUAGE_VERSION}" "" nosycllang) if(nosycllang) - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "SYCL: It appears that the ${CMAKE_CXX_COMPILER} does not support SYCL") - set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") + set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") endif() # Placeholder for identifying various implemenations of SYCL compilers. # for now, set to the CMAKE_CXX_COMPILER_ID set(SYCL_IMPLEMENTATION_ID "${CMAKE_CXX_COMPILER_ID}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} ${SYCL_LINK_FLAGS}") +message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}") +message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}") +message(DEBUG "The SYCL Language Version is ${SYCL_LANGUAGE_VERSION}") -message(STATUS "Echo from ${CMAKE_CURRENT_SOURCE_DIR}/IntelDPCPPConfig.cmake") -message(STATUS "The SYCL compiler is ${SYCL_COMPILER}") -message(STATUS "The SYCL Flags are ${SYCL_FLAGS}") -message(STATUS "The SYCL Language Version is ${SYCL_LANGUAGE_VERSION}") +add_library(IntelSYCL::SYCL_CXX INTERFACE IMPORTED) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_COMPILE_OPTIONS ${SYCL_FLAGS}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_LINK_OPTIONS ${SYCL_LINK_FLAGS}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_INCLUDE_DIRECTORIES ${SYCL_INCLUDE_DIR}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_LINK_DIRECTORIES ${SYCL_LIBRARY_DIR}) find_package_handle_standard_args( - IntelDPCPP - FOUND_VAR IntelDPCPP_FOUND + IntelSYCL + FOUND_VAR IntelSYCL_FOUND REQUIRED_VARS SYCL_INCLUDE_DIR SYCL_LIBRARY_DIR SYCL_FLAGS VERSION_VAR SYCL_LANGUAGE_VERSION REASON_FAILURE_MESSAGE "${SYCL_REASON_FAILURE}") # Include in Cache set(SYCL_LANGUAGE_VERSION "${SYCL_LANGUAGE_VERSION}" CACHE STRING "SYCL Language version") -set(SYCL_INCLUDE_DIR "${SYCL_INCLUDE_DIR}" CACHE FILEPATH "SYCL Include directory") -set(SYCL_LIBRARY_DIR "${SYCL_LIBRARY_DIR}" CACHE FILEPATH "SYCL Library Directory") -set(SYCL_FLAGS "${SYCL_FLAGS}" CACHE STRING "SYCL flags for the compiler") + +function(add_sycl_to_target) + + set(one_value_args TARGET) + set(multi_value_args SOURCES) + cmake_parse_arguments(SYCL + "" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + + + get_target_property(__sycl_cxx_options IntelSYCL::SYCL_CXX INTERFACE_COMPILE_OPTIONS) + get_target_property(__sycl_cxx_include_directories IntelSYCL::SYCL_CXX INTERFACE_INCLUDE_DIRECTORIES) + + if(NOT ${ARGC}) + message(FATAL_ERROR " add_sycl_to_target() does not have any arguments") + elseif(${ARGC} EQUAL 1) + message(WARNING 
"add_sycl_to_target() have only one argument specified.. assuming the target to be ${ARGV}. +Adding sycl to all sources but that may effect compilation times") + set(SYCL_TARGET ${ARGV}) + endif() + + if(NOT SYCL_SOURCES) + message(WARNING "add_sycl_to_target() does not have sources specified.. Adding sycl to all sources but that may effect compilation times") + target_compile_options(${SYCL_TARGET} PUBLIC ${__sycl_cxx_options}) + target_include_directories(${SYCL_TARGET} PUBLIC ${__sycl_cxx_include_directories}) + endif() + + foreach(source ${SYCL_SOURCES}) + set_source_files_properties(${source} PROPERTIES COMPILE_OPTIONS "${__sycl_cxx_options}") + set_source_files_properties(${source} PROPERTIES INCLUDE_DIRECTORIES "${__sycl_cxx_include_directories}") + endforeach() + + get_target_property(__sycl_link_options + IntelSYCL::SYCL_CXX INTERFACE_LINK_OPTIONS) + target_link_options(${SYCL_TARGET} PRIVATE "${__sycl_link_options}") + get_target_property(__sycl_link_directories + IntelSYCL::SYCL_CXX INTERFACE_LINK_DIRECTORIES) + target_link_directories(${SYCL_TARGET} PUBLIC "${__sycl_link_directories}") +endfunction(add_sycl_to_target) diff --git a/conda-recipe/bld.bat b/conda-recipe/bld.bat index 07fa580bb4..e87254b446 100644 --- a/conda-recipe/bld.bat +++ b/conda-recipe/bld.bat @@ -6,6 +6,11 @@ set "INCLUDE=%BUILD_PREFIX%\include;%INCLUDE%" "%PYTHON%" setup.py clean --all set "SKBUILD_ARGS=-G Ninja -- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" +REM Overriding IPO is useful for building in resources constrained VMs (public CI) +if DEFINED OVERRIDE_INTEL_IPO ( + set "SKBUILD_ARGS=%SKBUILD_ARGS% -DCMAKE_INTERPROCEDURAL_OPTIMIZATION:BOOL=FALSE" +) + FOR %%V IN (14.0.0 14 15.0.0 15 16.0.0 16 17.0.0 17) DO @( REM set DIR_HINT if directory exists IF EXIST "%BUILD_PREFIX%\Library\lib\clang\%%V\" ( @@ -29,7 +34,7 @@ if EXIST "%PLATFORM_DIR%" ( if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( rem Install and assemble wheel package from the build bits - "%PYTHON%" setup.py install bdist_wheel %SKBUILD_ARGS% + "%PYTHON%" setup.py install bdist_wheel --build-number %GIT_DESCRIBE_NUMBER% %SKBUILD_ARGS% if errorlevel 1 exit 1 copy dist\dpctl*.whl %WHEELS_OUTPUT_FOLDER% if errorlevel 1 exit 1 diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh index 87155d3fb3..f21660ec50 100755 --- a/conda-recipe/build.sh +++ b/conda-recipe/build.sh @@ -17,11 +17,7 @@ echo "${PYTHON} setup.py install ${SKBUILD_ARGS}" if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then # Install packages and assemble wheel package from built bits - if [ "$CONDA_PY" == "36" ]; then - WHEELS_BUILD_ARGS="-p manylinux1_x86_64" - else - WHEELS_BUILD_ARGS="-p manylinux2014_x86_64" - fi + WHEELS_BUILD_ARGS="-p manylinux2014_x86_64 --build-number ${GIT_DESCRIBE_NUMBER}" ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} cp dist/dpctl*.whl ${WHEELS_OUTPUT_FOLDER} else diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 7b6f071610..2806fb9262 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -9,6 +9,7 @@ build: number: {{ GIT_DESCRIBE_NUMBER }} script_env: - WHEELS_OUTPUT_FOLDER + - OVERRIDE_INTEL_IPO # [win] requirements: build: diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index a466d3eef1..cb872ff45f 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -58,7 +58,6 @@ elseif(UNIX) string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}" - "-fsycl " ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") @@ -137,10 +136,15 @@ add_custom_target( set(CMAKE_INSTALL_RPATH "$ORIGIN") function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "" "" ${ARGN}) add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) set(_cythonize_trgt "${_trgt}_cythonize_pyx") add_custom_target(${_cythonize_trgt} DEPENDS ${_src}) - python_add_library(${_trgt} MODULE ${_generated_src}) + Python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if (BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + endif() target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR}) add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt}) if (DPCTL_GENERATE_COVERAGE) @@ -185,13 +189,18 @@ function(build_dpctl_ext _trgt _src _dest) install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) endfunction() -file(GLOB _cython_sources *.pyx) +file(GLOB _cython_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.pyx) +list(REMOVE_ITEM _cython_sources ${CMAKE_CURRENT_SOURCE_DIR}/_sycl_queue.pyx) foreach(_cy_file ${_cython_sources}) get_filename_component(_trgt ${_cy_file} NAME_WLE) build_dpctl_ext(${_trgt} ${_cy_file} "dpctl") endforeach() -target_include_directories(_sycl_queue PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +set(_cy_file ${CMAKE_CURRENT_SOURCE_DIR}/_sycl_queue.pyx) +get_filename_component(_trgt ${_cy_file} NAME_WLE) +build_dpctl_ext(${_trgt} ${_cy_file} "dpctl" SYCL) +# _sycl_queue include _host_task_util.hpp +target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(program) add_subdirectory(memory) diff --git a/dpctl/_backend.pxd b/dpctl/_backend.pxd index 3f7ba63a55..57da77eb7d 100644 --- a/dpctl/_backend.pxd +++ b/dpctl/_backend.pxd @@ -403,6 +403,13 @@ cdef extern from "syclinterface/dpctl_sycl_queue_interface.h": void *Dest, const void *Src, size_t Count) + cdef DPCTLSyclEventRef DPCTLQueue_MemcpyWithEvents( + const DPCTLSyclQueueRef Q, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *depEvents, + size_t depEventsCount) cdef DPCTLSyclEventRef DPCTLQueue_Memset( const DPCTLSyclQueueRef Q, void *Dest, diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 8db17594fd..cb3828a54f 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -29,30 +29,30 @@
 ///
 //===----------------------------------------------------------------------===//

+#pragma once
 #include "Python.h"
 #include "syclinterface/dpctl_data_types.h"
+#include "syclinterface/dpctl_sycl_type_casters.hpp"
 #include <vector>

-int async_dec_ref(DPCTLSyclQueueRef QRef,
-                  PyObject **obj_array,
-                  size_t obj_array_size,
-                  DPCTLSyclEventRef *ERefs,
-                  size_t nERefs)
+DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef,
+                                PyObject **obj_array,
+                                size_t obj_array_size,
+                                DPCTLSyclEventRef *depERefs,
+                                size_t nDepERefs,
+                                int *status)
 {
+    using dpctl::syclinterface::unwrap;
+    using dpctl::syclinterface::wrap;

-    sycl::queue *q = reinterpret_cast<sycl::queue *>(QRef);
+    sycl::queue *q = unwrap<sycl::queue>(QRef);

-    std::vector<PyObject *> obj_vec;
-    obj_vec.reserve(obj_array_size);
-    for (size_t obj_id = 0; obj_id < obj_array_size; ++obj_id) {
-        obj_vec.push_back(obj_array[obj_id]);
-    }
+    std::vector<PyObject *> obj_vec(obj_array, obj_array + obj_array_size);

     try {
-        q->submit([&](sycl::handler &cgh) {
-            for (size_t ev_id = 0; ev_id < nERefs; ++ev_id) {
-                cgh.depends_on(
-                    *(reinterpret_cast<sycl::event *>(ERefs[ev_id])));
+        sycl::event ht_ev = q->submit([&](sycl::handler &cgh) {
+            for (size_t ev_id = 0; ev_id < nDepERefs; ++ev_id) {
+                cgh.depends_on(*(unwrap<sycl::event>(depERefs[ev_id])));
             }
             cgh.host_task([obj_array_size, obj_vec]() {
                 // if the main thread has not finilized the interpreter yet
@@ -66,9 +66,21 @@ int async_dec_ref(DPCTLSyclQueueRef QRef,
                }
            });
        });
+
+        constexpr int result_ok = 0;
+
+        *status = result_ok;
+        auto e_ptr = new sycl::event(ht_ev);
+        return wrap<sycl::event>(e_ptr);
    } catch (const std::exception &e) {
-        return 1;
+        constexpr int result_std_exception = 1;
+
+        *status = result_std_exception;
+        return nullptr;
    }

-    return 0;
+    constexpr int result_other_abnormal = 2;
+
+    *status = result_other_abnormal;
+    return nullptr;
 }
+29,6 @@ from ._sycl_event cimport SyclEvent from .program._program cimport SyclKernel -cdef void default_async_error_handler(int) except * nogil - cdef public api class _SyclQueue [ object Py_SyclQueueObject, type Py_SyclQueueType ]: @@ -72,6 +70,19 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef SyclContext get_sycl_context(self) cpdef SyclDevice get_sycl_device(self) cdef DPCTLSyclQueueRef get_queue_ref(self) + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ) + cpdef SyclEvent submit_async( + self, + SyclKernel kernel, + list args, + list gS, + list lS=*, + list dEvents=* + ) cpdef SyclEvent submit( self, SyclKernel kernel, @@ -83,6 +94,7 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef void wait(self) cdef DPCTLSyclQueueRef get_queue_ref(self) cpdef memcpy(self, dest, src, size_t count) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=*) cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 6acf3396e1..a27e6f940f 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -45,6 +45,7 @@ from ._backend cimport ( # noqa: E211 DPCTLQueue_IsInOrder, DPCTLQueue_MemAdvise, DPCTLQueue_Memcpy, + DPCTLQueue_MemcpyWithEvents, DPCTLQueue_Prefetch, DPCTLQueue_SubmitBarrierForEvents, DPCTLQueue_SubmitNDRange, @@ -56,7 +57,6 @@ from ._backend cimport ( # noqa: E211 _arg_data_type, _backend_type, _queue_property_type, - error_handler_callback, ) from .memory._memory cimport _Memory @@ -65,6 +65,7 @@ import ctypes from .enum_types import backend_type from cpython cimport pycapsule +from cpython.buffer cimport PyObject_CheckBuffer from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject from libc.stdlib cimport free, malloc @@ -73,7 +74,7 @@ import logging cdef extern from "_host_task_util.hpp": - int async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t) nogil + DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t, int *) nogil __all__ = [ @@ -114,18 +115,6 @@ cdef class SyclQueueCreationError(Exception): pass -cdef class SyclAsynchronousError(Exception): - """ - A SyclAsynchronousError exception is raised when SYCL operation submission - or execution encounters an error. 
- """ - - -cdef void default_async_error_handler(int err) except * nogil: - with gil: - raise SyclAsynchronousError(err) - - cdef int _parse_queue_properties(object prop) except *: cdef int res = 0 cdef object props @@ -173,6 +162,62 @@ cdef void _queue_capsule_deleter(object o) noexcept: DPCTLQueue_Delete(QRef) +cdef bint _is_buffer(object o): + return PyObject_CheckBuffer(o) + + +cdef DPCTLSyclEventRef _memcpy_impl( + SyclQueue q, + object dst, + object src, + size_t byte_count, + DPCTLSyclEventRef *dep_events, + size_t dep_events_count +) except *: + cdef void *c_dst_ptr = NULL + cdef void *c_src_ptr = NULL + cdef DPCTLSyclEventRef ERef = NULL + cdef const unsigned char[::1] src_host_buf = None + cdef unsigned char[::1] dst_host_buf = None + + if isinstance(src, _Memory): + c_src_ptr = (<_Memory>src).memory_ptr + elif _is_buffer(src): + src_host_buf = src + c_src_ptr = &src_host_buf[0] + else: + raise TypeError( + "Parameter `src` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if isinstance(dst, _Memory): + c_dst_ptr = (<_Memory>dst).memory_ptr + elif _is_buffer(dst): + dst_host_buf = dst + c_dst_ptr = &dst_host_buf[0] + else: + raise TypeError( + "Parameter `dst` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if dep_events_count == 0 or dep_events is NULL: + ERef = DPCTLQueue_Memcpy(q._queue_ref, c_dst_ptr, c_src_ptr, byte_count) + else: + ERef = DPCTLQueue_MemcpyWithEvents( + q._queue_ref, + c_dst_ptr, + c_src_ptr, + byte_count, + dep_events, + dep_events_count + ) + return ERef + + cdef class _SyclQueue: """ Barebone data owner class used by SyclQueue. """ @@ -404,7 +449,7 @@ cdef class SyclQueue(_SyclQueue): QRef = DPCTLQueue_Create( CRef, DRef, - &default_async_error_handler, + NULL, props ) if QRef is NULL: @@ -481,7 +526,7 @@ cdef class SyclQueue(_SyclQueue): QRef = DPCTLQueue_Create( CRef, DRef, - &default_async_error_handler, + NULL, props ) if (QRef is NULL): @@ -566,7 +611,7 @@ cdef class SyclQueue(_SyclQueue): qref = DPCTLQueue_Create( cref, dref, - &default_async_error_handler, + NULL, props ) if qref is NULL: @@ -716,7 +761,81 @@ cdef class SyclQueue(_SyclQueue): """ return self._queue_ref - cpdef SyclEvent submit( + + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ): + """ SyclQueue._submit_keep_args_alive(args, events) + + Keeps objects in `args` alive until tasks associated with events + complete. + + Args: + args(object): Python object to keep alive. + Typically a tuple with arguments to offloaded tasks + events(Tuple[dpctl.SyclEvent]): Gating events + The list or tuple of events associated with tasks + working on Python objects collected in `args`. + Returns: + dpctl.SyclEvent + The event associated with the submission of host task. + + Increments reference count of `args` and schedules asynchronous + ``host_task`` to decrement the count once dependent events are + complete. + + N.B.: The `host_task` attempts to acquire Python GIL, and it is + known to be unsafe during interpreter shudown sequence. It is + thus strongly advised to ensure that all submitted `host_task` + complete before the end of the Python script. 
+ """ + cdef size_t nDE = len(dEvents) + cdef DPCTLSyclEventRef *depEvents = NULL + cdef PyObject *args_raw = NULL + cdef DPCTLSyclEventRef htERef = NULL + cdef int status = -1 + + # Create the array of dependent events if any + if nDE > 0: + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if not depEvents: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + + # increment reference counts to list of arguments + Py_INCREF(args) + + # schedule decrement + args_raw = args + + htERef = async_dec_ref( + self.get_queue_ref(), + &args_raw, 1, + depEvents, nDE, &status + ) + + free(depEvents) + if (status != 0): + with nogil: DPCTLEvent_Wait(htERef) + DPCTLEvent_Delete(htERef) + raise RuntimeError("Could not submit keep_args_alive host_task") + + return SyclEvent._create(htERef) + + + cpdef SyclEvent submit_async( self, SyclKernel kernel, list args, @@ -728,13 +847,14 @@ cdef class SyclQueue(_SyclQueue): cdef _arg_data_type *kargty = NULL cdef DPCTLSyclEventRef *depEvents = NULL cdef DPCTLSyclEventRef Eref = NULL + cdef DPCTLSyclEventRef htEref = NULL cdef int ret = 0 cdef size_t gRange[3] cdef size_t lRange[3] cdef size_t nGS = len(gS) cdef size_t nLS = len(lS) if lS is not None else 0 cdef size_t nDE = len(dEvents) if dEvents is not None else 0 - cdef PyObject **arg_objects = NULL + cdef PyObject *args_raw = NULL cdef ssize_t i = 0 # Allocate the arrays to be sent to DPCTLQueue_Submit @@ -758,7 +878,15 @@ cdef class SyclQueue(_SyclQueue): raise MemoryError() else: for idx, de in enumerate(dEvents): - depEvents[idx] = (de).get_event_ref() + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(kargs) + free(kargty) + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) # populate the args and argstype arrays ret = self._populate_args(args, kargs, kargty) @@ -836,50 +964,69 @@ cdef class SyclQueue(_SyclQueue): raise SyclKernelSubmitError( "Kernel submission to Sycl queue failed." 
            )
-            # increment reference counts to each argument
-            arg_objects = <PyObject **>malloc(len(args) * sizeof(PyObject *))
-            for i in range(len(args)):
-                arg_objects[i] = <PyObject *>(args[i])
-                Py_INCREF(<object> arg_objects[i])
-
-            # schedule decrement
-            if async_dec_ref(self.get_queue_ref(), arg_objects, len(args), &Eref, 1):
-                # async task submission failed, decrement ref counts and wait
-                for i in range(len(args)):
-                    arg_objects[i] = <PyObject *>(args[i])
-                    Py_DECREF(<object> arg_objects[i])
-                with nogil: DPCTLEvent_Wait(Eref)
-
-            # free memory
-            free(arg_objects)
        return SyclEvent._create(Eref)

+    cpdef SyclEvent submit(
+        self,
+        SyclKernel kernel,
+        list args,
+        list gS,
+        list lS=None,
+        list dEvents=None
+    ):
+        cdef SyclEvent e = self.submit_async(kernel, args, gS, lS, dEvents)
+        e.wait()
+        return e
+
    cpdef void wait(self):
        with nogil: DPCTLQueue_Wait(self._queue_ref)

    cpdef memcpy(self, dest, src, size_t count):
-        cdef void *c_dest
-        cdef void *c_src
+        """Copy memory from `src` to `dest`, blocking until the copy completes"""
        cdef DPCTLSyclEventRef ERef = NULL

-        if isinstance(dest, _Memory):
-            c_dest = <void*>(<_Memory>dest).memory_ptr
-        else:
-            raise TypeError("Parameter `dest` should have type _Memory.")
+        ERef = _memcpy_impl(self, dest, src, count, NULL, 0)
+        if (ERef is NULL):
+            raise RuntimeError(
+                "SyclQueue.memcpy operation encountered an error"
+            )
+        with nogil: DPCTLEvent_Wait(ERef)
+        DPCTLEvent_Delete(ERef)
+
+    cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=None):
+        """Copy memory from `src` to `dest` asynchronously, returning an event"""
+        cdef DPCTLSyclEventRef ERef = NULL
+        cdef DPCTLSyclEventRef *depEvents = NULL
+        cdef size_t nDE = 0

-        if isinstance(src, _Memory):
-            c_src = <void*>(<_Memory>src).memory_ptr
+        if dEvents is None:
+            ERef = _memcpy_impl(self, dest, src, count, NULL, 0)
        else:
-            raise TypeError("Parameter `src` should have type _Memory.")
+            nDE = len(dEvents)
+            depEvents = <DPCTLSyclEventRef*>(
+                malloc(nDE*sizeof(DPCTLSyclEventRef))
+            )
+            if depEvents is NULL:
+                raise MemoryError()
+            else:
+                for idx, de in enumerate(dEvents):
+                    if isinstance(de, SyclEvent):
+                        depEvents[idx] = (<SyclEvent>de).get_event_ref()
+                    else:
+                        free(depEvents)
+                        raise TypeError(
+                            "A sequence of dpctl.SyclEvent is expected"
+                        )
+            ERef = _memcpy_impl(self, dest, src, count, depEvents, nDE)
+            free(depEvents)

-        ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count)
        if (ERef is NULL):
            raise RuntimeError(
                "SyclQueue.memcpy operation encountered an error"
            )
-        with nogil: DPCTLEvent_Wait(ERef)
-        DPCTLEvent_Delete(ERef)
+
+        return SyclEvent._create(ERef)

    cpdef prefetch(self, mem, size_t count=0):
        cdef void *ptr
diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py
index 322272df2d..66dd4f9340 100644
--- a/dpctl/_sycl_timer.py
+++ b/dpctl/_sycl_timer.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
 import timeit

 from . import SyclQueue
@@ -22,6 +21,29 @@
 __doc__ = "This module implements :class:`dpctl.SyclTimer`."
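+
+
+# Value type returned by SyclTimer.dt: exposes host_dt and device_dt as
+# properties and iterates as the pair (host_dt, device_dt), so existing
+# tuple unpacking of the timer result keeps working.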
+class HostDeviceDuration: + def __init__(self, host_dt, device_dt): + self._host_dt = host_dt + self._device_dt = device_dt + + def __repr__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __str__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __iter__(self): + yield from [self._host_dt, self._device_dt] + + @property + def host_dt(self): + return self._host_dt + + @property + def device_dt(self): + return self._device_dt + + class SyclTimer: """ SyclTimer(host_timer=timeit.default_timer, time_scale=1) @@ -45,7 +67,7 @@ class SyclTimer: code_block # retrieve elapsed times in milliseconds - sycl_dt, wall_dt = timer.dt + wall_dt, device_dt = timer.dt Remark: The timer submits barriers to the queue at the entrance and the @@ -67,10 +89,8 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1): self.timer = host_timer self.time_scale = time_scale self.queue = None - self.host_start = None - self.host_finish = None - self.event_start = None - self.event_finish = None + self.host_times = [] + self.bracketing_events = [] def __call__(self, queue=None): if isinstance(queue, SyclQueue): @@ -89,27 +109,31 @@ def __call__(self, queue=None): return self def __enter__(self): - self.event_start = self.queue.submit_barrier() - self.host_start = self.timer() + self._event_start = self.queue.submit_barrier() + self._host_start = self.timer() return self def __exit__(self, *args): - self.event_finish = self.queue.submit_barrier() - self.host_finish = self.timer() + self.host_times.append((self._host_start, self.timer())) + self.bracketing_events.append( + (self._event_start, self.queue.submit_barrier()) + ) + del self._event_start + del self._host_start @property def dt(self): - """Returns a tuple of elapsed times where first - element is the duration as measured by the host timer, - while the second element is the duration as measured by - the device timer and encoded in profiling events""" - self.event_start.wait() - self.event_finish.wait() - return ( - (self.host_finish - self.host_start) * self.time_scale, - ( - self.event_finish.profiling_info_start - - self.event_start.profiling_info_end - ) - * (1e-9 * self.time_scale), - ) + """Returns a pair of elapsed times (host_dt, device_dt). 
+ + The host_dt is the duration as measured by the host + timer, while the device_dt is the duration as measured by + the device timer and encoded in profiling events.""" + for es, ef in self.bracketing_events: + es.wait() + ef.wait() + host_dt = sum(tf - ts for ts, tf in self.host_times) * self.time_scale + dev_dt = sum( + ef.profiling_info_start - es.profiling_info_end + for es, ef in self.bracketing_events + ) * (1e-9 * self.time_scale) + return HostDeviceDuration(host_dt, dev_dt) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 456eebdbaa..d1de208805 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -30,11 +30,96 @@ if(WIN32) endif() endif() -set(python_module_name _tensor_impl) -pybind11_add_module(${python_module_name} MODULE - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp +set(_elementwise_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp +) +set(_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp +) +set(_boolean_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp @@ -46,39 +131,84 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) +set(_tensor_elementwise_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_elementwise_sources} +) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_boolean_reduction_sources} + ${_reduction_sources} +) + +set(_py_trgts) + +set(python_module_name _tensor_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_elementwise_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") endif() -set_source_files_properties( + +set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp - PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math") + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) +list(APPEND _no_fast_math_sources + ${_elementwise_sources} + ${_reduction_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES 
COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() if (UNIX) set_source_files_properties( - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES") endif() -target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) -target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) -if(UNIX) - # this option is supported on Linux only - target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) -endif() -target_include_directories(${python_module_name} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ -) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") -target_link_options(${python_module_name} PRIVATE ${_linker_options}) -add_dependencies(${python_module_name} _dpctl4pybind11_deps) -install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +foreach(python_module_name ${_py_trgts}) + target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) + if(UNIX) + # this option is supported on Linux only + target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) + endif() + target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + add_dependencies(${python_module_name} _dpctl4pybind11_deps) + install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +endforeach() diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index f0930004ec..4355fea442 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -90,9 +90,12 @@ ) from dpctl.tensor._reshape import reshape from dpctl.tensor._search_functions import where +from dpctl.tensor._statistical_functions import mean, std, var from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._array_api import __array_api_version__, __array_namespace_info__ +from ._clip import clip from ._constants import e, inf, nan, newaxis, pi from ._elementwise_funcs import ( abs, @@ -110,13 +113,16 @@ bitwise_or, bitwise_right_shift, bitwise_xor, + cbrt, ceil, conj, + copysign, cos, cosh, divide, equal, exp, + exp2, expm1, floor, floor_divide, @@ -149,6 +155,7 @@ real, remainder, round, + rsqrt, sign, signbit, sin, @@ -160,7 +167,16 @@ tanh, trunc, ) -from ._reduction import sum +from ._reduction import ( + argmax, + argmin, + logsumexp, + max, + min, + prod, + reduce_hypot, + sum, +) from ._testing import allclose __all__ = [ @@ -309,4 +325,21 @@ "allclose", "repeat", "tile", + "max", + "min", + "argmax", + "argmin", + "prod", + "cbrt", + "exp2", + "copysign", + "rsqrt", + "clip", + "logsumexp", + "reduce_hypot", + "mean", + "std", + "var", + "__array_api_version__", + "__array_namespace_info__", ] diff --git a/dpctl/tensor/_array_api.py b/dpctl/tensor/_array_api.py new file mode 100644 index 0000000000..613d6dcd66 --- /dev/null +++ b/dpctl/tensor/_array_api.py @@ -0,0 +1,207 @@ +# Data Parallel Control 
(dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.tensor._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+
+def _isdtype_impl(dtype, kind):
+    if isinstance(kind, str):
+        if kind == "bool":
+            return dtype.kind == "b"
+        elif kind == "signed integer":
+            return dtype.kind == "i"
+        elif kind == "unsigned integer":
+            return dtype.kind == "u"
+        elif kind == "integral":
+            return dtype.kind in "iu"
+        elif kind == "real floating":
+            return dtype.kind == "f"
+        elif kind == "complex floating":
+            return dtype.kind == "c"
+        elif kind == "numeric":
+            return dtype.kind in "iufc"
+        else:
+            raise ValueError(f"Unrecognized data type kind: {kind}")
+
+    elif isinstance(kind, tuple):
+        return any(_isdtype_impl(dtype, k) for k in kind)
+    else:
+        raise TypeError(f"Unsupported data type kind: {kind}")
+
+
+__array_api_version__ = "2022.12"
+
+
+class Info:
+    """
+    namespace returned by `__array_namespace_info__()`
+    """
+
+    def __init__(self):
+        self._capabilities = {
+            "boolean_indexing": True,
+            "data_dependent_shapes": True,
+        }
+        self._all_dtypes = {
+            "bool": dpt.bool,
+            "float32": dpt.float32,
+            "float64": dpt.float64,
+            "complex64": dpt.complex64,
+            "complex128": dpt.complex128,
+            "int8": dpt.int8,
+            "int16": dpt.int16,
+            "int32": dpt.int32,
+            "int64": dpt.int64,
+            "uint8": dpt.uint8,
+            "uint16": dpt.uint16,
+            "uint32": dpt.uint32,
+            "uint64": dpt.uint64,
+        }
+
+    def capabilities(self):
+        """
+        Returns a dictionary of `dpctl`'s capabilities.
+
+        Returns:
+            dict:
+                dictionary of `dpctl`'s capabilities
+                - `boolean_indexing`: bool
+                - `data_dependent_shapes`: bool
+        """
+        return self._capabilities.copy()
+
+    def default_device(self):
+        """
+        Returns the default SYCL device.
+        """
+        return dpctl.select_default_device()
+
+    def default_dtypes(self, device=None):
+        """
+        Returns a dictionary of default data types for `device`.
+
+        Args:
+            device (Optional[dpctl.SyclDevice, dpctl.SyclQueue,
+                dpctl.tensor.Device]):
+                array API concept of device used in getting default data types.
+                `device` can be `None` (in which case the default device is
+                used), an instance of :class:`dpctl.SyclDevice` corresponding
+                to a non-partitioned SYCL device, an instance of
+                :class:`dpctl.SyclQueue`, or a `Device` object returned by
+                :attr:`dpctl.tensor.usm_ndarray.device`. Default: `None`.
+
+        Returns:
+            dict:
+                a dictionary of default data types for `device`
+                - `real floating`: dtype
+                - `complex floating`: dtype
+                - `integral`: dtype
+                - `indexing`: dtype
+        """
+        if device is None:
+            device = dpctl.select_default_device()
+        elif isinstance(device, dpt.Device):
+            device = device.sycl_device
+        return {
+            "real floating": dpt.dtype(default_device_fp_type(device)),
+            "complex floating": dpt.dtype(default_device_complex_type(device)),
+            "integral": dpt.dtype(default_device_int_type(device)),
+            "indexing": dpt.dtype(default_device_index_type(device)),
+        }
+
+    def dtypes(self, device=None, kind=None):
+        """
+        Returns a dictionary of all Array API data types of a specified `kind`
+        supported by `device`.
+
+        This dictionary only includes data types supported by the array API
+        specification: https://data-apis.org/array-api/latest/
+
+        Args:
+            device (Optional[dpctl.SyclDevice, dpctl.SyclQueue,
+                dpctl.tensor.Device, str]):
+                array API concept of device used in getting default data types.
+                `device` can be `None` (in which case the default device is
+                used), an instance of :class:`dpctl.SyclDevice` corresponding
+                to a non-partitioned SYCL device, an instance of
+                :class:`dpctl.SyclQueue`, or a `Device` object returned by
+                :attr:`dpctl.tensor.usm_ndarray.device`. Default: `None`.
+
+            kind (Optional[str, Tuple[str, ...]]):
+                data type kind.
+                - if `kind` is `None`, returns a dictionary of all data types
+                  supported by `device`
+                - if `kind` is a string, returns a dictionary containing the
+                  data types belonging to the data type kind specified.
+                  Supports:
+                  - "bool"
+                  - "signed integer"
+                  - "unsigned integer"
+                  - "integral"
+                  - "real floating"
+                  - "complex floating"
+                  - "numeric"
+                - if `kind` is a tuple, the tuple represents a union of `kind`
+                  strings, and returns a dictionary containing data types
+                  corresponding to the specified union.
+                Default: `None`.
+
+        Returns:
+            dict:
+                a dictionary of the supported data types of the specified `kind`
+        """
+        if device is None:
+            device = dpctl.select_default_device()
+        elif isinstance(device, dpt.Device):
+            device = device.sycl_device
+        _fp64 = device.has_aspect_fp64
+        if kind is None:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if (key != "float64" or _fp64)
+            }
+        else:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if (key != "float64" or _fp64) and _isdtype_impl(val, kind)
+            }
+
+    def devices(self):
+        """
+        Returns a list of supported devices.
+        """
+        return dpctl.get_devices()
+
+
+def __array_namespace_info__():
+    """__array_namespace_info__()
+
+    Returns a namespace with Array API namespace inspection utilities.
+
+    """
+    return Info()
diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py
new file mode 100644
index 0000000000..eeed87b404
--- /dev/null
+++ b/dpctl/tensor/_clip.py
@@ -0,0 +1,838 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
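+# Implements dpctl.tensor.clip. The helper below resolves mixed strong
+# (array) and weak (Python scalar) data types following NEP 50 rules
+# before dispatching to the underlying kernels.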
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_elementwise_impl as tei
+import dpctl.tensor._tensor_impl as ti
+from dpctl.tensor._copy_utils import (
+    _empty_like_orderK,
+    _empty_like_pair_orderK,
+    _empty_like_triple_orderK,
+)
+from dpctl.tensor._elementwise_common import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _strong_dtype_num_kind,
+    _validate_dtype,
+    _weak_type_num_kind,
+)
+from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
+from dpctl.tensor._type_utils import _can_cast, _to_device_supported_dtype
+from dpctl.utils import ExecutionPlacementError
+
+
+def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev):
+    """Resolves weak data types per NEP-0050, where the second and
+    third arguments are permitted to be weak types."""
+    if isinstance(
+        st_dtype,
+        (
+            WeakBooleanType,
+            WeakIntegralType,
+            WeakFloatingType,
+            WeakComplexType,
+        ),
+    ):
+        raise ValueError("first argument must be a strong data type")
+    if isinstance(
+        dtype1,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        if isinstance(
+            dtype2,
+            (
+                WeakBooleanType,
+                WeakIntegralType,
+                WeakFloatingType,
+                WeakComplexType,
+            ),
+        ):
+            kind_num1 = _weak_type_num_kind(dtype1)
+            kind_num2 = _weak_type_num_kind(dtype2)
+            st_kind_num = _strong_dtype_num_kind(st_dtype)
+
+            if kind_num1 > st_kind_num:
+                if isinstance(dtype1, WeakIntegralType):
+                    ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype1, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype1 = dpt.complex64
+                    else:
+                        ret_dtype1 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype1 = st_dtype
+
+            if kind_num2 > st_kind_num:
+                if isinstance(dtype2, WeakIntegralType):
+                    ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype2, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype2 = dpt.complex64
+                    else:
+                        ret_dtype2 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype2 = st_dtype
+
+            return ret_dtype1, ret_dtype2
+
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype2), dtype2),
+            ]
+        )
+        dt1_kind_num = _weak_type_num_kind(dtype1)
+        if dt1_kind_num > max_dt_num_kind:
+            if isinstance(dtype1, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), dtype2
+            if isinstance(dtype1, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dpt.complex64, dtype2
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    dtype2,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), dtype2
+        else:
+            return max_dtype, dtype2
+    elif isinstance(
+        dtype2,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype1), dtype1),
+            ]
+        )
+        dt2_kind_num = _weak_type_num_kind(dtype2)
+        if dt2_kind_num > max_dt_num_kind:
+            if isinstance(dtype2, WeakIntegralType):
+                return dtype1, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype2, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dtype1, dpt.complex64
+                return (
+                    dtype1,
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                )
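The resolver above encodes the NEP 50 rule that a weak (Python-scalar) operand only changes the result kind when its kind outranks the strong operand's kind. A minimal standalone sketch of that ordering; the kind numbers and helper name here are illustrative, not dpctl's internal values:

    # Illustrative NEP 50 kind ordering: bool < integral < floating < complex.
    _KIND_ORDER = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3}


    def weak_operand_is_absorbed(strong_kind, weak_kind):
        """True when the weak operand simply adopts the strong dtype."""
        return _KIND_ORDER[weak_kind] <= _KIND_ORDER[strong_kind]


    assert weak_operand_is_absorbed("f", "i")  # int scalar with float array
    assert not weak_operand_is_absorbed("i", "f")  # float scalar upgrades int array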
+            return dtype1, _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return dtype1, max_dtype
+    else:
+        # both are strong dtypes
+        # return unmodified
+        return dtype1, dtype2
+
+
+def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
+    "Resolves one weak data type with one strong data type per NEP-0050"
+    if isinstance(
+        st_dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        raise ValueError("first argument must be a strong data type")
+    if isinstance(
+        dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        st_kind_num = _strong_dtype_num_kind(st_dtype)
+        kind_num = _weak_type_num_kind(dtype)
+        if kind_num > st_kind_num:
+            if isinstance(dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype, WeakComplexType):
+                if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                    return dpt.complex64
+                return _to_device_supported_dtype(dpt.complex128, dev)
+            return _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return st_dtype
+    else:
+        return dtype
+
+
+def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
+    """Checks if both types `arg1_dtype` and `arg2_dtype` can be
+    cast to `res_dtype` according to the casting rule `safe`."""
+    if arg1_dtype == res_dtype and arg2_dtype == res_dtype:
+        return None, None, res_dtype
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast(
+        arg2_dtype, res_dtype, _fp16, _fp64
+    ):
+        # prevent unnecessary casting
+        ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype
+        ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype
+        return ret_buf1_dt, ret_buf2_dt, res_dtype
+    else:
+        return None, None, None
+
+
+def _clip_none(x, val, out, order, _binary_fn):
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, x_usm_type = x.sycl_queue, x.usm_type
+    q2, val_usm_type = _get_queue_usm_type(val)
+    if q2 is None:
+        exec_q = q1
+        res_usm_type = x_usm_type
+    else:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        res_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                x_usm_type,
+                val_usm_type,
+            )
+        )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    x_shape = x.shape
+    val_shape = _get_shape(val)
+    if not isinstance(val_shape, (tuple, list)):
+        raise TypeError(
+            "Shape of arguments can not be inferred. "
+            "Arguments are expected to be "
+            "lists, tuples, or both"
+        )
+    try:
+        res_shape = _broadcast_shape_impl(
+            [
+                x_shape,
+                val_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError(
+            "operands could not be broadcast together with shapes "
+            f"{x_shape} and {val_shape}"
+        )
+    sycl_dev = exec_q.sycl_device
+    x_dtype = x.dtype
+    val_dtype = _get_dtype(val, sycl_dev)
+    if not _validate_dtype(val_dtype):
+        raise ValueError("Operands have unsupported data types")
+
+    val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev)
+
+    res_dt = x.dtype
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if not _can_cast(val_dtype, res_dt, _fp16, _fp64):
+        raise ValueError(
+            "function 'clip' does not support input types "
+            f"({x_dtype}, {val_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+ ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + if order == "A": + order = "F" if x.flags.f_contiguous else "C" + buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + +# need to handle logic for min or max being None +def clip(x, min=None, max=None, out=None, order="K"): + """clip(x, min, max, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. 
+ min ({None, usm_ndarray}, optional): Array containing minimum values. + Must be compatible with `x` and `max` according + to broadcasting rules. + Only one of `min` and `max` can be `None`. + max ({None, usm_ndarray}, optional): Array containing maximum values. + Must be compatible with `x` and `min` according + to broadcasting rules. + Only one of `min` and `max` can be `None`. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is + `None`. + Default: "K". + + Returns: + usm_ndarray: + An array with elements clipped to the range [`min`, `max`]. + The returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected `x` to be of dpctl.tensor.usm_ndarray type, got " + f"{type(x)}" + ) + if min is None and max is None: + raise ValueError( + "only one of `min` and `max` is permitted to be `None`" + ) + elif max is None: + return _clip_none(x, min, out, order, tei._maximum) + elif min is None: + return _clip_none(x, max, out, order, tei._minimum) + else: + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, min_usm_type = _get_queue_usm_type(min) + q3, max_usm_type = _get_queue_usm_type(max) + if q2 is None and q3 is None: + exec_q = q1 + res_usm_type = x_usm_type + elif q3 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + ) + ) + elif q2 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + max_usm_type, + ) + ) + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + max_usm_type, + ) + ) + dpctl.utils.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + min_shape = _get_shape(min) + max_shape = _get_shape(max) + if not all( + isinstance(s, (tuple, list)) + for s in ( + min_shape, + max_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, min=a_min, max=a_max, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + + elif buf1_dt is None: + if order == "K": + buf2 = 
_empty_like_orderK(a_max, buf2_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + ) + ) + else "C" + ) + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_max, + ) + ) + else "C" + ) + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + if order in ["K", "A"]: + if ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + elif ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + else: + order = "C" if order == "A" else "K" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + 
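For reference, the user-visible contract these branches implement: `clip` always returns an array with the data type of `x`, and weak Python-scalar bounds are cast to it when the `safe` casting rule allows. A small usage sketch (dtypes shown assume a typical device):

    import dpctl.tensor as dpt

    x = dpt.asarray([-2, 0, 5], dtype=dpt.int32)
    y = dpt.clip(x, 0, 3)  # Python int bounds are weak and cast safely
    print(y.dtype, dpt.asnumpy(y))  # int32 [0 0 3]

    z = dpt.asarray([0.1, 2.7], dtype=dpt.float32)
    w = dpt.clip(z, 0, 1)  # weak integral bounds cast to float32
    print(w.dtype)  # float32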
x = dpt.broadcast_to(x, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
+        ht_, _ = ti._clip(
+            src=x,
+            min=buf1,
+            max=buf2,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy1_ev, copy2_ev],
+        )
+        dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
+        return out
diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index bc1b071460..81928692a6 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -37,7 +37,7 @@ def _copy_to_numpy(ary):
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}")
     nb = ary.usm_data.nbytes
     hh = dpm.MemoryUSMHost(nb, queue=ary.sycl_queue)
     hh.copy_from_device(ary.usm_data)
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index fca5b0734a..4a0d1c451f 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -39,6 +39,31 @@
 class UnaryElementwiseFunc:
     """
     Class that implements unary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the unary function
+        result_type_resolver_fn (callable):
+            Function that takes dtype of the input and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        unary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values, effectively
+            evaluating `dst = func(src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        docs (str):
+            Documentation string for the unary function.
     """

     def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs):
@@ -55,8 +80,31 @@ def __str__(self):
     def __repr__(self):
         return f"<{self.__name__} '{self.name_}'>"

+    def get_implementation_function(self):
+        """Returns the implementation function for
+        this elementwise unary function.
+
+        """
+        return self.unary_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise unary function.
+        """
+        return self.result_type_resolver_fn_
+
     @property
     def types(self):
+        """Returns information about types supported by
+        implementation function, using NumPy's character
+        encoding for data types, e.g.
+
+        :Example:
+        .. code-block:: python
+
+            dpctl.tensor.sin.types
+            # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
+        """
         types = self.types_
         if not types:
             types = []
@@ -363,6 +411,56 @@ def _get_shape(o):
 class BinaryElementwiseFunc:
     """
     Class that implements binary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the binary function
+        result_type_resolver_fn (callable):
+            Function that takes dtypes of the input and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        binary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src1` and `src2` are the argument arrays, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(src1, src2)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        docs (str):
+            Documentation string for the binary function.
+        binary_inplace_fn (callable, optional):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(dst, src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including async lifetime management of Python arguments,
+            while the second event corresponds to computational tasks
+            associated with function evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this binary
+            function. The function takes 6 arguments:
+                arg1_dtype - Data type of the first argument
+                arg2_dtype - Data type of the second argument
+                ret_buf1_dtype - Data type the first argument would be cast to
+                ret_buf2_dtype - Data type the second argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                evaluation is carried out.
+            The function is only called when both arguments of the binary
+            function require casting, e.g. both arguments of
+            `dpctl.tensor.logaddexp` are arrays with integral data type.
     """

     def __init__(
@@ -392,8 +490,60 @@ def __str__(self):
     def __repr__(self):
         return f"<{self.__name__} '{self.name_}'>"

+    def get_implementation_function(self):
+        """Returns the out-of-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_fn_
+
+    def get_implementation_inplace_function(self):
+        """Returns the in-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_inplace_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise binary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise binary function.
+
+        Acceptance function influences the type promotion
+        behavior of this binary function.
+        The function takes 6 arguments:
+            arg1_dtype - Data type of the first argument
+            arg2_dtype - Data type of the second argument
+            ret_buf1_dtype - Data type the first argument would be cast to
+            ret_buf2_dtype - Data type the second argument would be cast to
+            res_dtype - Data type of the output array with function values
+            sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation
+            is carried out.
+ + The acceptance function is only invoked if both input arrays must be + cast to intermediary data types, as would happen during call of + `dpctl.tensor.hypot` with both arrays being of integral data type. + """ + return self.acceptance_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.divide.types + # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D', + # 'Ff->F', 'FF->F', 'Dd->D', 'DD->D'] + """ types = self.types_ if not types: types = [] @@ -649,12 +799,7 @@ def __call__(self, o1, o2, out=None, order="K"): sycl_queue=exec_q, order=order, ) - else: - if res_dt != out.dtype: - raise TypeError( - f"Output array of type {res_dt} is needed," - f"got {out.dtype}" - ) + if src1.shape != res_shape: src1 = dpt.broadcast_to(src1, res_shape) buf2 = dpt.broadcast_to(buf2, res_shape) diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 8e2abee837..9879960999 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_elementwise_impl as ti from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc from ._type_utils import _acceptance_fn_divide @@ -297,6 +297,7 @@ ti._bitwise_and_result_type, ti._bitwise_and, _bitwise_and_docstring_, + binary_inplace_fn=ti._bitwise_and_inplace, ) # B04: ===== BITWISE_LEFT_SHIFT (x1, x2) @@ -330,6 +331,7 @@ ti._bitwise_left_shift_result_type, ti._bitwise_left_shift, _bitwise_left_shift_docstring_, + binary_inplace_fn=ti._bitwise_left_shift_inplace, ) @@ -393,6 +395,7 @@ ti._bitwise_or_result_type, ti._bitwise_or, _bitwise_or_docstring_, + binary_inplace_fn=ti._bitwise_or_inplace, ) # B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) @@ -425,6 +428,7 @@ ti._bitwise_right_shift_result_type, ti._bitwise_right_shift, _bitwise_right_shift_docstring_, + binary_inplace_fn=ti._bitwise_right_shift_inplace, ) @@ -459,6 +463,7 @@ ti._bitwise_xor_result_type, ti._bitwise_xor, _bitwise_xor_docstring_, + binary_inplace_fn=ti._bitwise_xor_inplace, ) @@ -590,6 +595,7 @@ ti._divide_result_type, ti._divide, _divide_docstring_, + binary_inplace_fn=ti._divide_inplace, acceptance_fn=_acceptance_fn_divide, ) @@ -720,6 +726,7 @@ ti._floor_divide_result_type, ti._floor_divide, _floor_divide_docstring_, + binary_inplace_fn=ti._floor_divide_inplace, ) # B11: ==== GREATER (x1, x2) @@ -1176,7 +1183,7 @@ _logical_xor_docstring_, ) -# B??: ==== MAXIMUM (x1, x2) +# B26: ==== MAXIMUM (x1, x2) _maximum_docstring_ = """ maximum(x1, x2, out=None, order='K') @@ -1206,7 +1213,7 @@ _maximum_docstring_, ) -# B??: ==== MINIMUM (x1, x2) +# B27: ==== MINIMUM (x1, x2) _minimum_docstring_ = """ minimum(x1, x2, out=None, order='K') @@ -1264,7 +1271,7 @@ ti._multiply_result_type, ti._multiply, _multiply_docstring_, - ti._multiply_inplace, + binary_inplace_fn=ti._multiply_inplace, ) # U25: ==== NEGATIVE (x) @@ -1359,10 +1366,14 @@ the returned array is determined by the Type Promotion Rules. 
""" pow = BinaryElementwiseFunc( - "pow", ti._pow_result_type, ti._pow, _pow_docstring_ + "pow", + ti._pow_result_type, + ti._pow, + _pow_docstring_, + binary_inplace_fn=ti._pow_inplace, ) -# U??: ==== PROJ (x) +# U40: ==== PROJ (x) _proj_docstring = """ proj(x, out=None, order='K') @@ -1441,7 +1452,11 @@ the returned array is determined by the Type Promotion Rules. """ remainder = BinaryElementwiseFunc( - "remainder", ti._remainder_result_type, ti._remainder, _remainder_docstring_ + "remainder", + ti._remainder_result_type, + ti._remainder, + _remainder_docstring_, + binary_inplace_fn=ti._remainder_inplace, ) # U28: ==== ROUND (x) @@ -1499,7 +1514,7 @@ "sign", ti._sign_result_type, ti._sign, _sign_docstring ) -# ==== SIGNBIT (x) +# U41: ==== SIGNBIT (x) _signbit_docstring = """ signbit(x, out=None, order='K') @@ -1652,7 +1667,7 @@ ti._subtract_result_type, ti._subtract, _subtract_docstring_, - ti._subtract_inplace, + binary_inplace_fn=ti._subtract_inplace, ) @@ -1759,3 +1774,116 @@ hypot = BinaryElementwiseFunc( "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_ ) + + +# U37: ==== CBRT (x) +_cbrt_docstring_ = """ +cbrt(x, out=None, order='K') + +Computes positive cube-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise positive cube-root. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +cbrt = UnaryElementwiseFunc( + "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_ +) + + +# U38: ==== EXP2 (x) +_exp2_docstring_ = """ +exp2(x, out=None, order='K') + +Computes the base-2 exponential for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise base-2 exponentials. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +exp2 = UnaryElementwiseFunc( + "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_ +) + + +# B25: ==== COPYSIGN (x1, x2) +_copysign_docstring_ = """ +copysign(x1, x2, out=None, order='K') + +Composes a floating-point value with the magnitude of `x1_i` and the sign of +`x2_i` for each element of input arrays `x1` and `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real floating-point data type. + x2 (usm_ndarray): + Second input array, also expected to have a real floating-point data + type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" +copysign = BinaryElementwiseFunc( + "copysign", + ti._copysign_result_type, + ti._copysign, + _copysign_docstring_, +) + + +# U39: ==== RSQRT (x) +_rsqrt_docstring_ = """ +rsqrt(x, out=None, order='K') + +Computes the reciprocal square-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise reciprocal square-root. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +rsqrt = UnaryElementwiseFunc( + "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_ +) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 7201cd96fb..7135304b58 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -19,7 +19,6 @@ import operator import numpy as np -from numpy import AxisError from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple import dpctl @@ -929,20 +928,26 @@ def repeat(x, repeats, axis=None): Args: x (usm_ndarray): input array - repeat (Union[int, Tuple[int, ...]]): + repeats (Union[int, Sequence[int, ...], usm_ndarray]): The number of repetitions for each element. - `repeats` is broadcasted to fit the shape of the given axis. + `repeats` is broadcast to fit the shape of the given axis. + If `repeats` is an array, it must have an integer data type. + Otherwise, `repeats` must be a Python integer, tuple, list, or + range. axis (Optional[int]): - The axis along which to repeat values. The `axis` is required - if input array has more than one dimension. + The axis along which to repeat values. If `axis` is `None`, the + function repeats elements of the flattened array. + Default: `None`. Returns: usm_narray: Array with repeated elements. - The returned array must have the same data type as `x`, - is created on the same device as `x` and has the same USM - allocation type as `x`. + The returned array must have the same data type as `x`, is created + on the same device as `x` and has the same USM allocation type as + `x`. If `axis` is `None`, the returned array is one-dimensional, + otherwise, it has the same shape as `x`, except for the axis along + which elements were repeated. Raises: AxisError: if `axis` value is invalid. 
@@ -951,20 +956,11 @@ def repeat(x, repeats, axis=None): raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") x_ndim = x.ndim - if axis is None: - if x_ndim > 1: - raise ValueError( - f"`axis` cannot be `None` for array of dimension {x_ndim}" - ) - axis = 0 - x_shape = x.shape - if x_ndim > 0: + if axis is not None: axis = normalize_axis_index(operator.index(axis), x_ndim) axis_size = x_shape[axis] else: - if axis != 0: - AxisError("`axis` must be `0` for input of dimension `0`") axis_size = x.size scalar = False @@ -977,8 +973,8 @@ def repeat(x, repeats, axis=None): elif isinstance(repeats, dpt.usm_ndarray): if repeats.ndim > 1: raise ValueError( - "`repeats` array must be 0- or 1-dimensional, got" - "{repeats.ndim}" + "`repeats` array must be 0- or 1-dimensional, got " + f"{repeats.ndim}" ) exec_q = dpctl.utils.get_execution_queue( (x.sycl_queue, repeats.sycl_queue) @@ -1015,22 +1011,22 @@ def repeat(x, repeats, axis=None): if not dpt.all(repeats >= 0): raise ValueError("`repeats` elements must be positive") - elif isinstance(repeats, tuple): + elif isinstance(repeats, (tuple, list, range)): usm_type = x.usm_type exec_q = x.sycl_queue len_reps = len(repeats) - if len_reps != axis_size: - raise ValueError( - "`repeats` tuple must have the same length as the repeated " - "axis" - ) - elif len_reps == 1: + if len_reps == 1: repeats = repeats[0] if repeats < 0: raise ValueError("`repeats` elements must be positive") scalar = True else: + if len_reps != axis_size: + raise ValueError( + "`repeats` sequence must have the same length as the " + "repeated axis" + ) repeats = dpt.asarray( repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q ) @@ -1038,7 +1034,7 @@ def repeat(x, repeats, axis=None): raise ValueError("`repeats` elements must be positive") else: raise TypeError( - "Expected int, tuple, or `usm_ndarray` for second argument," + "Expected int, sequence, or `usm_ndarray` for second argument," f"got {type(repeats)}" ) @@ -1047,7 +1043,10 @@ def repeat(x, repeats, axis=None): if scalar: res_axis_size = repeats * axis_size - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + else: + res_shape = (res_axis_size,) res = dpt.empty( res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q ) @@ -1081,9 +1080,17 @@ def repeat(x, repeats, axis=None): res_axis_size = ti._cumsum_1d( rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] ) - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) res = dpt.empty( - res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, ) if res_axis_size > 0: ht_rep_ev, _ = ti._repeat_by_sequence( @@ -1103,11 +1110,18 @@ def repeat(x, repeats, axis=None): usm_type=usm_type, sycl_queue=exec_q, ) - # _cumsum_1d synchronizes so `depends` ends here safely res_axis_size = ti._cumsum_1d(repeats, cumsum, sycl_queue=exec_q) - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) res = dpt.empty( - res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, ) if res_axis_size > 0: ht_rep_ev, 
_ = ti._repeat_by_sequence( diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index d9bd6b5b2b..059ba61030 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -19,6 +19,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri from ._type_utils import _to_device_supported_dtype @@ -52,18 +53,143 @@ def _default_reduction_dtype(inp_dt, q): return res_dt -def sum(arr, axis=None, dtype=None, keepdims=False): +def _default_reduction_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when reduction is performed on queue `q` + and the reduction supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + if res_dt.itemsize < inp_dt.itemsize: + res_dt = inp_dt + elif inp_kind in "c": + raise TypeError("reduction not defined for complex types") + + return res_dt + + +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - red_nd] + q = x.sycl_queue + inp_dt = x.dtype + if dtype is None: + res_dt = _default_reduction_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + res_usm_type = x.usm_type + if red_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + host_tasks_list = [] + if _dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e, _ = _reduction_fn( + src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e) + else: + if dtype is None: + raise RuntimeError( + "Automatically determined reduction data type does not " + "have direct implementation" + ) + if _dtype_supported(res_dt, res_dt, res_usm_type, q): + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, _ = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + else: + buf_dt = _default_reduction_type_fn(inp_dt, q) + tmp = dpt.empty( + arr2.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + tmp_res = dpt.empty( + res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = 
_reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=res, sycl_queue=q, depends=[r_e] + ) + host_tasks_list.append(ht_e_cpy2) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + + return res + + +def sum(x, axis=None, dtype=None, keepdims=False): """sum(x, axis=None, dtype=None, keepdims=False) - Calculates the sum of the input array `x`. + Calculates the sum of elements in the input array `x`. Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which sums must be computed. If a tuple of unique integers, sums are computed over multiple axes. - If `None`, the sum if computed over the entire array. + If `None`, the sum is computed over the entire array. Default: `None`. dtype (Optional[dtype]): data type of the returned array. If `None`, the default data @@ -72,13 +198,13 @@ def sum(arr, axis=None, dtype=None, keepdims=False): the returned array will have the default real-valued floating-point data type for the device where input array `x` is allocated. - * If x` has signed integral data type, the returned array + * If `x` has signed integral data type, the returned array will have the default signed integral type for the device where input array `x` is allocated. * If `x` has unsigned integral data type, the returned array will have the default unsigned integral type for the device where input array `x` is allocated. - * If `x` has a complex-valued floating-point data typee, + * If `x` has a complex-valued floating-point data type, the returned array will have the default complex-valued floating-pointer data type for the device where input array `x` is allocated. @@ -101,9 +227,192 @@ def sum(arr, axis=None, dtype=None, keepdims=False): array has the data type as described in the `dtype` parameter description above. """ - if not isinstance(arr, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(arr)}") - nd = arr.ndim + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._sum_over_axis, + tri._sum_over_axis_dtype_supported, + _default_reduction_dtype, + ) + + +def prod(x, axis=None, dtype=None, keepdims=False): + """prod(x, axis=None, dtype=None, keepdims=False) + + Calculates the product of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If `None`, the product is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. 
+ * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + the returned array will have the default complex-valued + floating-pointer data type for the device where input + array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the product. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, + _default_reduction_dtype, + ) + + +def logsumexp(x, axis=None, dtype=None, keepdims=False): + """logsumexp(x, axis=None, dtype=None, keepdims=False) + + Calculates the logarithm of the sum of exponentials of elements in the + input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If `None`, the result is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + an error is raised. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the result. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_reduction_dtype_fp_types, + ) + + +def reduce_hypot(x, axis=None, dtype=None, keepdims=False): + """reduce_hypot(x, axis=None, dtype=None, keepdims=False) + + Calculates the square root of the sum of squares of elements in the input + array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If `None`, the result is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + an error is raised. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the result. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + tri._hypot_over_axis, + lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_reduction_dtype_fp_types, + ) + + +def _comparison_over_axis(x, axis, keepdims, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim if axis is None: axis = tuple(range(nd)) if not isinstance(axis, (tuple, list)): @@ -111,63 +420,221 @@ def sum(arr, axis=None, dtype=None, keepdims=False): axis = normalize_axis_tuple(axis, nd, "axis") red_nd = len(axis) perm = [i for i in range(nd) if i not in axis] + list(axis) - arr2 = dpt.permute_dims(arr, perm) - res_shape = arr2.shape[: nd - red_nd] - q = arr.sycl_queue - inp_dt = arr.dtype - if dtype is None: - res_dt = _default_reduction_dtype(inp_dt, q) - else: - res_dt = dpt.dtype(dtype) - res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) - - res_usm_type = arr.usm_type - if arr.size == 0: - if keepdims: - res_shape = res_shape + (1,) * red_nd - inv_perm = sorted(range(nd), key=lambda d: perm[d]) - res_shape = tuple(res_shape[i] for i in inv_perm) - return dpt.zeros( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q - ) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = x.dtype + res_usm_type = x.usm_type + if x.size == 0: + if any([x.shape[i] == 0 for i in axis]): + raise ValueError( + "reduction cannot be performed over zero-size axes" + ) + else: + return dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) if red_nd == 0: - return dpt.astype(arr, res_dt, copy=False) + return dpt.copy(x) - host_tasks_list = [] - if ti._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): - res = dpt.empty( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q - ) - ht_e, _ = ti._sum_over_axis( - src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q - ) - host_tasks_list.append(ht_e) + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + hev.wait() + return res + + +def max(x, axis=None, keepdims=False): + """max(x, axis=None, keepdims=False) + + Calculates the maximum value of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If `None`, the max is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. 
+ """ + return _comparison_over_axis(x, axis, keepdims, tri._max_over_axis) + + +def min(x, axis=None, keepdims=False): + """min(x, axis=None, keepdims=False) + + Calculates the minimum value of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If `None`, the min is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. + """ + return _comparison_over_axis(x, axis, keepdims, tri._min_over_axis) + + +def _search_over_axis(x, axis, keepdims, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + elif isinstance(axis, int): + axis = (axis,) else: - if dtype is None: - raise RuntimeError( - "Automatically determined reduction data type does not " - "have direct implementation" - ) - tmp_dt = _default_reduction_dtype(inp_dt, q) - tmp = dpt.empty( - res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q - ) - ht_e_tmp, r_e = ti._sum_over_axis( - src=arr2, trailing_dims_to_reduce=red_nd, dst=tmp, sycl_queue=q - ) - host_tasks_list.append(ht_e_tmp) - res = dpt.empty( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + raise TypeError( + f"`axis` argument expected `int` or `None`, got {type(axis)}" ) - ht_e, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=tmp, dst=res, sycl_queue=q, depends=[r_e] + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) + res_usm_type = x.usm_type + if x.size == 0: + if any([x.shape[i] == 0 for i in axis]): + raise ValueError( + "reduction cannot be performed over zero-size axes" + ) + else: + return dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + if red_nd == 0: + return dpt.zeros( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) - host_tasks_list.append(ht_e) + + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) - + hev.wait() return res + + +def argmax(x, axis=None, keepdims=False): + """argmax(x, axis=None, keepdims=False) + + Returns the indices of the maximum values of the input array `x` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. 
+ + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + maximum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. + """ + return _search_over_axis(x, axis, keepdims, tri._argmax_over_axis) + + +def argmin(x, axis=None, keepdims=False): + """argmin(x, axis=None, keepdims=False) + + Returns the indices of the minimum values of the input array `x` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + minimum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. + """ + return _search_over_axis(x, axis, keepdims, tri._argmin_over_axis) diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py new file mode 100644 index 0000000000..54d748d2d2 --- /dev/null +++ b/dpctl/tensor/_statistical_functions.py @@ -0,0 +1,381 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
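The module introduced below composes existing primitives (sum reduction, in-place divide, subtract, square) into mean, var and std; a sketch of the basic identity it builds on, assuming a default SYCL device:

    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(6, dtype="f4"), (2, 3))
    dpt.mean(x, axis=1)     # [1.0, 4.0]
    dpt.sum(x, axis=1) / 3  # the same values: sum over the axis divided by its length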
+ +from numpy.core.numeric import normalize_axis_tuple + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei +import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri + + +def _var_impl(x, axis, correction, keepdims): + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + red_nd = len(axis) + perm = perm + list(axis) + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + inp_dt + if inp_dt.kind == "f" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + + deps = [] + host_tasks_list = [] + if inp_dt != res_dt: + buf = dpt.empty_like(x, dtype=res_dt) + ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=buf, sycl_queue=q + ) + deps.append(c_e1) + host_tasks_list.append(ht_e_buf) + else: + buf = x + # calculate mean + buf2 = dpt.permute_dims(buf, perm) + res_shape = buf2.shape[: nd - red_nd] + # use keepdims=True path for later broadcasting + if red_nd == 0: + mean_ary = dpt.empty_like(buf) + ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=buf, dst=mean_ary, sycl_queue=q + ) + deps.append(c_e2) + host_tasks_list.append(ht_e1) + else: + mean_ary = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e1, r_e1 = tri._sum_over_axis( + src=buf2, + trailing_dims_to_reduce=red_nd, + dst=mean_ary, + sycl_queue=q, + depends=deps, + ) + host_tasks_list.append(ht_e1) + deps.append(r_e1) + + mean_ary_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + mean_ary = dpt.permute_dims( + dpt.reshape(mean_ary, mean_ary_shape), inv_perm + ) + # divide in-place to get mean + mean_ary_shape = mean_ary.shape + nelems_ary = dpt.asarray( + nelems, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_ary.shape != mean_ary_shape: + nelems_ary = dpt.broadcast_to(nelems_ary, mean_ary_shape) + ht_e2, d_e1 = tei._divide_inplace( + lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=deps + ) + host_tasks_list.append(ht_e2) + # subtract mean from original array to get deviations + dev_ary = dpt.empty_like(buf) + if mean_ary_shape != buf.shape: + mean_ary = dpt.broadcast_to(mean_ary, buf.shape) + ht_e4, su_e = tei._subtract( + src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1] + ) + host_tasks_list.append(ht_e4) + # square deviations + ht_e5, sq_e = tei._square( + src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e] + ) + host_tasks_list.append(ht_e5) + deps2 = [] + # take sum of squared deviations + dev_ary2 = dpt.permute_dims(dev_ary, perm) + if red_nd == 0: + res = dev_ary + deps2.append(sq_e) + else: + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e6, r_e2 = tri._sum_over_axis( + src=dev_ary2, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[sq_e], + ) + host_tasks_list.append(ht_e6) + deps2.append(r_e2) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + res_shape = res.shape + # when nelems - correction <= 0, yield nans + div = max(nelems - correction, 0) + if not div: + div = dpt.nan + div_ary = dpt.asarray(div, res_dt, usm_type=res_usm_type, sycl_queue=q) + # divide 
in-place again + if div_ary.shape != res_shape: + div_ary = dpt.broadcast_to(div_ary, res.shape) + ht_e7, d_e2 = tei._divide_inplace( + lhs=res, rhs=div_ary, sycl_queue=q, depends=deps2 + ) + host_tasks_list.append(ht_e7) + return res, [d_e2], host_tasks_list + + +def mean(x, axis=None, keepdims=False): + """mean(x, axis=None, keepdims=False) + + Calculates the arithmetic mean of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the arithmetic means must be computed. If + a tuple of unique integers, the means are computed over multiple + axes. If `None`, the mean is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the arithmetic means. If the mean was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a floating-point data type, the returned array will have + the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + sum_nd = len(axis) + perm = perm + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - sum_nd] + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + x.dtype + if x.dtype.kind in "fc" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + if sum_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + s_e = [] + host_tasks_list = [] + if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e1, r_e = tri._sum_over_axis( + src=arr2, trailing_dims_to_reduce=sum_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e1) + s_e.append(r_e) + else: + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = tri._sum_over_axis( + src=tmp, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + s_e.append(r_e) + + if keepdims: + res_shape = res_shape + (1,) * sum_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + + res_shape = res.shape + # in-place divide + den_dt = dpt.finfo(res_dt).dtype if res_dt.kind == "c" else res_dt + nelems_arr = dpt.asarray( + nelems, dtype=den_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_arr.shape != res_shape: + nelems_arr = dpt.broadcast_to(nelems_arr, res_shape) + 
ht_e2, _ = tei._divide_inplace(
+        lhs=res, rhs=nelems_arr, sycl_queue=q, depends=s_e
+    )
+    host_tasks_list.append(ht_e2)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
+
+
+def var(x, axis=None, correction=0.0, keepdims=False):
+    """var(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the variance of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the variances must be computed. If a tuple
+            of unique integers, the variances are computed over multiple axes.
+            If `None`, the variance is computed over the entire array.
+            Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            variance is `N-correction`, where `N` corresponds to the total
+            number of elements over which the variance is calculated.
+            Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the variances. If the variance was computed
+            over the entire array, a zero-dimensional array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`, "
+            f"got {type(correction)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`var` does not support complex types")
+
+    res, _, host_tasks_list = _var_impl(x, axis, correction, keepdims)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
+
+
+def std(x, axis=None, correction=0.0, keepdims=False):
+    """std(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the standard deviation of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the standard deviations must be computed.
+            If a tuple of unique integers, the standard deviations are computed
+            over multiple axes. If `None`, the standard deviation is computed
+            over the entire array. Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            standard deviation is `N-correction`, where `N` corresponds to the
+            total number of elements over which the standard deviation is
+            calculated. Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the standard deviations. If the standard
+            deviation was computed over the entire array, a zero-dimensional
+            array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`, "
+            f"got {type(correction)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`std` does not support complex types")
+
+    res, deps, host_tasks_list = _var_impl(x, axis, correction, keepdims)
+    ht_ev, _ = tei._sqrt(
+        src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps
+    )
+    host_tasks_list.append(ht_ev)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index ba18600135..5b394d971b 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -182,13 +182,20 @@ cdef class usm_ndarray:
         cdef bint is_fp16 = False
 
         self._reset()
-        if (not isinstance(shape, (list, tuple))
-                and not hasattr(shape, 'tolist')):
-            try:
-                shape
-                shape = [shape, ]
-            except Exception:
-                raise TypeError("Argument shape must be a list or a tuple.")
+        if not isinstance(shape, (list, tuple)):
+            if hasattr(shape, 'tolist'):
+                fn = getattr(shape, 'tolist')
+                if callable(fn):
+                    shape = shape.tolist()
+            if not isinstance(shape, (list, tuple)):
+                try:
+                    shape
+                    shape = [shape, ]
+                except Exception as e:
+                    raise TypeError(
+                        "Argument shape must be a non-negative integer, "
+                        "or a list/tuple of such integers."
+                    ) from e
         nd = len(shape)
         if dtype is None:
             if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)):
diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py
index 500c997e8f..69a1a200df 100644
--- a/dpctl/tensor/_utility_functions.py
+++ b/dpctl/tensor/_utility_functions.py
@@ -3,6 +3,7 @@
 import dpctl
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_impl as ti
+import dpctl.tensor._tensor_reductions_impl as tri
 
 
 def _boolean_reduction(x, axis, keepdims, func):
@@ -94,7 +95,7 @@ def all(x, axis=None, keepdims=False):
             An array with a data type of `bool` containing the results of the
             logical AND reduction.
     """
-    return _boolean_reduction(x, axis, keepdims, ti._all)
+    return _boolean_reduction(x, axis, keepdims, tri._all)
 
 
 def any(x, axis=None, keepdims=False):
@@ -122,4 +123,4 @@ def any(x, axis=None, keepdims=False):
             An array with a data type of `bool` containing the results of the
             logical OR reduction.
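The boolean reductions keep their user-facing behavior after being rerouted to the `tri` module; a quick sketch (assuming a default SYCL device):

    import dpctl.tensor as dpt

    x = dpt.asarray([[0, 1], [2, 3]])
    dpt.any(x)          # 0-dimensional bool array: True, one element is nonzero
    dpt.all(x, axis=1)  # [False, True]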
""" - return _boolean_reduction(x, axis, keepdims, ti._any) + return _boolean_reduction(x, axis, keepdims, tri._any) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 110010706c..a8ef1c423e 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -125,54 +125,56 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, auto lws = sycl::range<1>(wg_size); auto gws = sycl::range<1>(n_groups * wg_size); + auto ndRange = sycl::nd_range<1>(gws, lws); + slmT slm_iscan_tmp(lws, cgh); - cgh.parallel_for>( - sycl::nd_range<1>(gws, lws), [=, slm_iscan_tmp = std::move(slm_iscan_tmp)](sycl::nd_item<1> it) - { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + using KernelName = inclusive_scan_rec_local_scan_krn< + inputT, outputT, n_wi, IndexerT, decltype(transformer)>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move( + slm_iscan_tmp)]( + sycl::nd_item<1> it) { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); - std::array local_isum; + std::array local_isum; - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } + local_isum[m_wi] = + (i + m_wi < n_elems) + ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } -// local_isum is now result of -// inclusive scan of locally stored mask indicators #pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + // local_isum is now result of + // inclusive scan of locally stored inputs - size_t wg_iscan_val = - sycl::inclusive_scan_over_group(it.get_group(), - local_isum.back(), - sycl::plus(), - size_t(0)); + size_t wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_isum.back(), sycl::plus(), + size_t(0)); - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; - it.barrier(sycl::access::fence_space::local_space); + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 
0 : slm_iscan_tmp[lid]; #pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { - output[i + m_wi] = local_isum[m_wi]; - } - }); + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) + { + output[i + m_wi] = local_isum[m_wi]; + } + }); }); sycl::event out_event = inc_scan_phase1_ev; diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp new file mode 100644 index 0000000000..9cca9f615b --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -0,0 +1,311 @@ +//=== clip.hpp - Implementation of clip kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.clip. +//===----------------------------------------------------------------------===// + +#pragma once +#include "pybind11/numpy.h" +#include "pybind11/stl.h" +#include +#include +#include +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace clip +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using namespace dpctl::tensor::offset_utils; + +template T clip(const T &x, const T &min, const T &max) +{ + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::max_complex; + using dpctl::tensor::math_utils::min_complex; + return min_complex(max_complex(x, min), max); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + auto tmp = (std::isnan(x) || x > min) ? x : min; + return (std::isnan(tmp) || tmp < max) ? tmp : max; + } + else if constexpr (std::is_same_v) { + return (x || min) && max; + } + else { + auto tmp = (x > min) ? x : min; + return (tmp < max) ? 
tmp : max; + } +} + +template class ClipContigFunctor +{ +private: + size_t nelems = 0; + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + +public: + ClipContigFunctor(size_t nelems_, + const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_) + : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; + size_t base = ndit.get_global_linear_id(); + + base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + for (size_t offset = base; + offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz)); + offset += sgSize) + { + dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t max_sgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * max_sgSize); + + if (base + n_vecs * vec_sz * sgSize < nelems && + sgSize == max_sgSize) { + sycl::vec x_vec; + sycl::vec min_vec; + sycl::vec max_vec; + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + auto idx = base + it * sgSize; + auto x_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x_p[idx]); + auto min_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&min_p[idx]); + auto max_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&max_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + x_vec = sg.load(x_multi_ptr); + min_vec = sg.load(min_multi_ptr); + max_vec = sg.load(max_multi_ptr); +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id], + max_vec[vec_id]); + } + sg.store(dst_multi_ptr, dst_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems; + k += sgSize) { + dst_p[k] = clip(x_p[k], min_p[k], max_p[k]); + } + } + } + } +}; + +template class clip_contig_kernel; + +typedef sycl::event (*clip_contig_impl_fn_ptr_t)( + sycl::queue &, + size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event clip_contig_impl(sycl::queue &q, + size_t nelems, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + ClipContigFunctor(nelems, x_tp, min_tp, max_tp, + dst_tp)); + }); + + return 
clip_ev; +} + +template class ClipStridedFunctor +{ +private: + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + IndexerT indexer; + +public: + ClipStridedFunctor(const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_, + IndexerT indexer_) + : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + dst_p[offsets.get_fourth_offset()] = clip( + x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()], + max_p[offsets.get_third_offset()]); + } +}; + +template class clip_strided_kernel; + +typedef sycl::event (*clip_strided_impl_fn_ptr_t)( + sycl::queue &, + size_t, + int, + const char *, + const char *, + const char *, + char *, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event clip_strided_impl(sycl::queue &q, + size_t nelems, + int nd, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const py::ssize_t *shape_strides, + py::ssize_t x_offset, + py::ssize_t min_offset, + py::ssize_t max_offset, + py::ssize_t dst_offset, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + FourOffsets_StridedIndexer indexer{ + nd, x_offset, min_offset, max_offset, dst_offset, shape_strides}; + + cgh.parallel_for>( + sycl::range<1>(nelems), + ClipStridedFunctor( + x_tp, min_tp, max_tp, dst_tp, indexer)); + }); + + return clip_ev; +} + +template struct ClipStridedFactory +{ + fnT get() + { + fnT fn = clip_strided_impl; + return fn; + } +}; + +template struct ClipContigFactory +{ + fnT get() + { + + fnT fn = clip_contig_impl; + return fn; + } +}; + +} // namespace clip +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index 016b3a05d3..d88d17d3e3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -257,6 +258,144 @@ struct BitwiseAndStridedFactory } }; +template struct BitwiseAndInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = res && in; + } + else { + res &= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res && in); + res = vec_cast( + tmp); + } + else { + res &= in; + } + } +}; + +template +using BitwiseAndInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseAndInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseAndInplaceStridedFunctor = + 
elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseAndInplaceFunctor>; + +template +class bitwise_and_inplace_contig_kernel; + +template +sycl::event +bitwise_and_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseAndInplaceContigFunctor, + bitwise_and_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseAndInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseAndOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_and_inplace_strided_kernel; + +template +sycl::event bitwise_and_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseAndInplaceStridedFunctor, + bitwise_and_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseAndInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseAndOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_and } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index 9ce56be966..ed4aeeb59e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -35,6 +35,8 @@ #include "utils/type_utils.hpp" #include +#include "kernels/elementwise_functions/common.hpp" + namespace dpctl { namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 4ae04f97de..5cfd6ca5e3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -268,6 +269,150 @@ struct BitwiseLeftShiftStridedFactory } }; +template struct BitwiseLeftShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + impl(res, in); + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) 
const + { + constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res <<= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res <<= in) : res = zero); + } + } +}; + +template +using BitwiseLeftShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseLeftShiftInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseLeftShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseLeftShiftInplaceFunctor>; + +template +class bitwise_left_shift_inplace_contig_kernel; + +template +sycl::event bitwise_left_shift_inplace_contig_impl( + sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, + bitwise_left_shift_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseLeftShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_left_shift_inplace_strided_kernel; + +template +sycl::event bitwise_left_shift_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseLeftShiftInplaceStridedFunctor, + bitwise_left_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseLeftShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_left_shift } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index 65f25dd296..d5669d41b1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -253,6 +254,144 @@ template struct BitwiseOrStridedFactory } }; +template struct BitwiseOrInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = res || in; + } + else { + res |= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + 
if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res || in); + res = vec_cast( + tmp); + } + else { + res |= in; + } + } +}; + +template +using BitwiseOrInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseOrInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseOrInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseOrInplaceFunctor>; + +template +class bitwise_or_inplace_contig_kernel; + +template +sycl::event +bitwise_or_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseOrInplaceContigFunctor, + bitwise_or_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseOrInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseOrOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_or_inplace_strided_kernel; + +template +sycl::event bitwise_or_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseOrInplaceStridedFunctor, + bitwise_or_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseOrInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseOrOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_or } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index 9442d4f6b7..5a04165701 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -270,6 +271,152 @@ struct BitwiseRightShiftStridedFactory } }; +template struct BitwiseRightShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + impl(res, in); + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) const + { + constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or 
>= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res >>= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res >>= in) + : (res < resT(0)) ? res = resT(-1) + : res = zero); + } + } +}; + +template +using BitwiseRightShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseRightShiftInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseRightShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseRightShiftInplaceFunctor>; + +template +class bitwise_right_shift_inplace_contig_kernel; + +template +sycl::event bitwise_right_shift_inplace_contig_impl( + sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseRightShiftInplaceContigFunctor, + bitwise_right_shift_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseRightShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_right_shift_inplace_strided_kernel; + +template +sycl::event bitwise_right_shift_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseRightShiftInplaceStridedFunctor, + bitwise_right_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseRightShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_right_shift } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 2b0ab09dca..ec8192fd0f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -257,6 +258,144 @@ struct BitwiseXorStridedFactory } }; +template struct BitwiseXorInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = (res != in); + } + else { + res ^= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res != 
in); + res = vec_cast( + tmp); + } + else { + res ^= in; + } + } +}; + +template +using BitwiseXorInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseXorInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseXorInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseXorInplaceFunctor>; + +template +class bitwise_xor_inplace_contig_kernel; + +template +sycl::event +bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseXorInplaceContigFunctor, + bitwise_xor_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseXorInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseXorOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_xor_inplace_strided_kernel; + +template +sycl::event bitwise_xor_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseXorInplaceStridedFunctor, + bitwise_xor_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseXorInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseXorOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_xor } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp new file mode 100644 index 0000000000..1d4aa65002 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -0,0 +1,172 @@ +//=== cbrt.hpp - Unary function CBRT ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CBRT(x) +/// function that compute a square root. 
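A sketch of the intended elementwise semantics, namely the real (sign-preserving) cube root; `dpt.cbrt` is assumed here to be the Python binding these kernels back:

    import dpctl.tensor as dpt

    x = dpt.asarray([8.0, 27.0, -1.0], dtype="f4")
    dpt.cbrt(x)  # [2.0, 3.0, -1.0]; sycl::cbrt is defined for negative inputs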
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace cbrt +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template struct CbrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + return sycl::cbrt(in); + } +}; + +template +using CbrtContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using CbrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct CbrtOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class cbrt_contig_kernel; + +template +sycl::event cbrt_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct CbrtContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_contig_impl; + return fn; + } + } +}; + +template struct CbrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::cbrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CbrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class cbrt_strided_kernel; + +template +sycl::event +cbrt_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct CbrtStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_strided_impl; + return fn; + } + } +}; + +} // namespace cbrt +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp new file mode 100644 index 0000000000..b1997d06b4 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -0,0 +1,215 @@ +//=== copysign.hpp - Binary function COPYSIGN ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2) +/// function. 
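A sketch of the expected semantics, assuming `dpt.copysign` is the binding built on these kernels: the magnitude of the first argument is combined with the sign of the second, signed zeros included:

    import dpctl.tensor as dpt

    x = dpt.asarray([1.5, -2.0, 3.0], dtype="f4")
    s = dpt.asarray([-1.0, 1.0, -0.0], dtype="f4")
    dpt.copysign(x, s)  # [-1.5, 2.0, -3.0]; the sign of -0.0 propagates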
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copysign +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct CopysignFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::copysign(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = sycl::copysign(in1, in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using CopysignContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + CopysignFunctor>; + +template struct CopysignOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class copysign_contig_kernel; + +template +sycl::event copysign_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, + copysign_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct CopysignContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename CopysignOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = copysign_contig_impl; + return fn; + } + } +}; + +template struct CopysignTypeMapFactory +{ + /*! 
@brief get typeid for output type of copysign(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+copysign_strided_impl(sycl::queue &exec_q,
+                      size_t nelems,
+                      int nd,
+                      const py::ssize_t *shape_and_strides,
+                      const char *arg1_p,
+                      py::ssize_t arg1_offset,
+                      const char *arg2_p,
+                      py::ssize_t arg2_offset,
+                      char *res_p,
+                      py::ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename CopysignOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace copysign
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
new file mode 100644
index 0000000000..67ee23df48
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -0,0 +1,229 @@
+//=== exp2.hpp - Unary function EXP2                             ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of EXP2(x) function.
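A sketch of the expected elementwise behavior, assuming `dpt.exp2` is the Python binding these kernels implement:

    import dpctl.tensor as dpt

    x = dpt.asarray([0.0, 1.0, 10.0], dtype="f4")
    dpt.exp2(x)  # [1.0, 2.0, 1024.0]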
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace exp2 +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template struct Exp2Functor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + const argT tmp = in * std::log(realT(2)); + + constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + + const realT x = std::real(tmp); + const realT y = std::imag(tmp); + if (std::isfinite(x)) { + if (std::isfinite(y)) { + return std::exp(tmp); + } + else { + return resT{q_nan, q_nan}; + } + } + else if (std::isnan(x)) { + /* x is nan */ + if (y == realT(0)) { + return resT{in}; + } + else { + return resT{x, q_nan}; + } + } + else { + if (!std::signbit(x)) { /* x is +inf */ + if (y == realT(0)) { + return resT{x, y}; + } + else if (std::isfinite(y)) { + return resT{x * std::cos(y), x * std::sin(y)}; + } + else { + /* x = +inf, y = +-inf || nan */ + return resT{x, q_nan}; + } + } + else { /* x is -inf */ + if (std::isfinite(y)) { + realT exp_x = std::exp(x); + return resT{exp_x * std::cos(y), exp_x * std::sin(y)}; + } + else { + /* x = -inf, y = +-inf || nan */ + return resT{0, 0}; + } + } + } + } + else { + return sycl::exp2(in); + } + } +}; + +template +using Exp2ContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using Exp2StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct Exp2OutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class exp2_contig_kernel; + +template +sycl::event exp2_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct Exp2ContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_contig_impl; + return fn; + } + } +}; + +template struct Exp2TypeMapFactory +{ + /*! 
@brief get typeid for output type of std::exp2(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Exp2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class exp2_strided_kernel; + +template +sycl::event +exp2_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct Exp2StridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_strided_impl; + return fn; + } + } +}; + +} // namespace exp2 +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index ad75924070..241c0e7ca8 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -57,12 +58,7 @@ struct FloorDivideFunctor resT operator()(const argT1 &in1, const argT2 &in2) const { - if constexpr (std::is_same_v && - std::is_same_v) { - return (in2) ? static_cast(in1) : resT(0); - } - else if constexpr (std::is_integral_v || - std::is_integral_v) { + if constexpr (std::is_integral_v || std::is_integral_v) { if (in2 == argT2(0)) { return resT(0); } @@ -87,16 +83,7 @@ struct FloorDivideFunctor operator()(const sycl::vec &in1, const sycl::vec &in2) const { - if constexpr (std::is_same_v && - std::is_same_v) { - sycl::vec res; -#pragma unroll - for (int i = 0; i < vec_sz; ++i) { - res[i] = (in2[i]) ? 
static_cast(in1[i]) : resT(0); - } - return res; - } - else if constexpr (std::is_integral_v) { + if constexpr (std::is_integral_v) { sycl::vec res; #pragma unroll for (int i = 0; i < vec_sz; ++i) { @@ -165,7 +152,6 @@ template struct FloorDivideOutputType { using value_type = typename std::disjunction< // disjunction is C++17 // feature, supported by DPC++ - td_ns::BinaryTypeMapResultEntry, td_ns::BinaryTypeMapResultEntry struct FloorDivideInplaceFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + void operator()(resT &in1, const argT &in2) const + { + if constexpr (std::is_integral_v) { + if (in2 == argT(0)) { + in1 = 0; + return; + } + if constexpr (std::is_signed_v) { + auto tmp = in1; + in1 /= in2; + auto mod = tmp % in2; + auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0)); + in1 -= corr; + } + else { + in1 /= in2; + } + } + else { + in1 /= in2; + if (in1 == resT(0)) { + return; + } + in1 = std::floor(in1); + } + } + + template + void operator()(sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] == argT(0)) { + in1[i] = 0; + } + else { + if constexpr (std::is_signed_v) { + auto tmp = in1[i]; + in1[i] /= in2[i]; + auto mod = tmp % in2[i]; + auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0)); + in1[i] -= corr; + } + else { + in1[i] /= in2[i]; + } + } + } + } + else { + in1 /= in2; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] != argT(0)) { + in1[i] = std::floor(in1[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const + { + return (b1 != b2); + } +}; + +template +using FloorDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + FloorDivideInplaceFunctor, + vec_sz, + n_vecs>; + +template +using FloorDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + FloorDivideInplaceFunctor>; + +template +class floor_divide_inplace_contig_kernel; + +template +sycl::event +floor_divide_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, FloorDivideInplaceContigFunctor, + floor_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct FloorDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename FloorDivideOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class floor_divide_inplace_strided_kernel; + +template +sycl::event floor_divide_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, FloorDivideInplaceStridedFunctor, + floor_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct FloorDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename FloorDivideOutputType::value_type, + 
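// The in-place functor above reproduces Python-style floor division for
// signed integers: truncating division plus a correction whenever the
// remainder is non-zero and has the opposite sign of the divisor. A
// standalone sketch of that rule (hypothetical helper name):
template <typename T>
T floor_div_sketch(T a, T b)
{
    if (b == T(0)) {
        return T(0); // division by zero yields 0 in these kernels
    }
    T q = a / b; // C++ integer division truncates toward zero
    T r = a % b;
    if (r != T(0) && ((r < T(0)) != (b < T(0)))) {
        q -= T(1); // step down to round toward negative infinity
    }
    return q;
}
// e.g. floor_div_sketch(-7, 2) == -4, whereas plain -7 / 2 == -3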
void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_strided_impl; + return fn; + } + } +}; + } // namespace floor_divide } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 90b7997a37..6a187da6f4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -31,6 +31,7 @@ #include #include +#include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" @@ -61,7 +62,8 @@ template struct LogAddExpFunctor resT operator()(const argT1 &in1, const argT2 &in2) const { - return impl(in1, in2); + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(in1, in2); } template @@ -79,7 +81,8 @@ template struct LogAddExpFunctor impl_finite(-std::abs(diff[i])); } else { - res[i] = impl(in1[i], in2[i]); + using dpctl::tensor::math_utils::logaddexp; + res[i] = logaddexp(in1[i], in2[i]); } } @@ -87,26 +90,6 @@ template struct LogAddExpFunctor } private: - template T impl(T const &in1, T const &in2) const - { - if (in1 == in2) { // handle signed infinities - const T log2 = std::log(T(2)); - return in1 + log2; - } - else { - const T tmp = in1 - in2; - if (tmp > 0) { - return in1 + std::log1p(std::exp(-tmp)); - } - else if (tmp <= 0) { - return in2 + std::log1p(std::exp(tmp)); - } - else { - return std::numeric_limits::quiet_NaN(); - } - } - } - template T impl_finite(T const &in) const { return (in > 0) ? (in + std::log1p(std::exp(-in))) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index ba9241b8db..6654bae384 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -55,31 +56,30 @@ template struct PowFunctor using supports_sg_loadstore = std::negation< std::disjunction, tu_ns::is_complex>>; - using supports_vec = - std::negation, - tu_ns::is_complex, - std::is_integral, - std::is_integral>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; - resT operator()(argT1 in1, argT2 in2) const + resT operator()(const argT1 &in1, const argT2 &in2) const { if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = in1; + auto tmp2 = in2; if constexpr (std::is_signed_v) { - if (in2 < 0) { + if (tmp2 < 0) { // invalid; return 0 return resT(0); } } resT res = 1; - if (in1 == 1 || in2 == 0) { + if (tmp1 == 1 || tmp2 == 0) { return res; } - while (in2 > 0) { - if (in2 & 1) { - res *= in1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res *= tmp1; } - in2 >>= 1; - in1 *= in1; + tmp2 >>= 1; + tmp1 *= tmp1; } return res; } @@ -93,16 +93,48 @@ template struct PowFunctor operator()(const sycl::vec &in1, const sycl::vec &in2) const { - auto res = sycl::pow(in1, in2); - if constexpr (std::is_same_v) { + if constexpr (std::is_integral_v || std::is_integral_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = in1[i]; + auto tmp2 = in2[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 
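// The scalar branch of LogAddExpFunctor now delegates to a shared
// math_utils::logaddexp helper. The inline impl() removed above computed
// log(exp(a) + exp(b)) overflow-safely by factoring out the larger
// argument; a sketch of that identity, which the shared helper is assumed
// to preserve (hypothetical name):
#include <algorithm>
#include <cmath>

template <typename T>
T logaddexp_sketch(T a, T b)
{
    if (a == b) {
        return a + std::log(T(2)); // also covers equal signed infinities
    }
    const T mx = std::max(a, b);
    const T mn = std::min(a, b);
    return mx + std::log1p(std::exp(mn - mx)); // NaN inputs propagate
}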
0) { + // invalid; yield 0 + res[i] = 0; + continue; + } + } + resT res_tmp = 1; + if (tmp1 == 1 || tmp2 == 0) { + res[i] = res_tmp; + continue; + } + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } return res; } else { - using dpctl::tensor::type_utils::vec_cast; + auto res = sycl::pow(in1, in2); + if constexpr (std::is_same_v) + { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; - return vec_cast( - res); + return vec_cast(res); + } } } }; @@ -128,10 +160,6 @@ using PowStridedFunctor = IndexerT, PowFunctor>; -// TODO: when type promotion logic is better defined, -// consider implementing overloads of std::pow that take -// integers for the exponents. Seem to give better accuracy in -// some cases (complex data especially) template struct PowOutputType { using value_type = typename std::disjunction< // disjunction is C++17 @@ -286,6 +314,184 @@ template struct PowStridedFactory } }; +template struct PowInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = res; + auto tmp2 = in; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res = 0; + return; + } + } + if (tmp1 == 1) { + return; + } + if (tmp2 == 0) { + res = 1; + return; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res = res_tmp; + return; + } + else { + res = std::pow(res, in); + }; + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = res[i]; + auto tmp2 = in[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res[i] = 0; + continue; + } + } + if (tmp1 == 1) { + continue; + } + if (tmp2 == 0) { + res[i] = 1; + continue; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } + } + else { + res = sycl::pow(res, in); + } + } +}; + +template +using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + PowInplaceFunctor, + vec_sz, + n_vecs>; + +template +using PowInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + PowInplaceFunctor>; + +template +class pow_inplace_contig_kernel; + +template +sycl::event +pow_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template struct PowInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_contig_impl; + return fn; + } + } +}; + +template +class pow_inplace_strided_kernel; + +template +sycl::event +pow_inplace_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char 
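// The integral branches of PowFunctor (and of PowInplaceFunctor below) use
// binary exponentiation: consume the exponent bit by bit, squaring the base
// at each step, so the loop runs O(log exponent) iterations. Negative
// integer exponents are treated as invalid and yield 0, matching the
// guarded branches above. A standalone sketch (hypothetical name):
template <typename T>
T ipow_sketch(T base, T exponent)
{
    if (exponent < T(0)) {
        return T(0); // invalid for integral types in these kernels
    }
    T result = T(1);
    while (exponent > T(0)) {
        if (exponent & T(1)) {
            result *= base; // fold in the base where the bit is set
        }
        exponent >>= 1;
        base *= base;
    }
    return result;
}
// e.g. ipow_sketch(3, 5) == 243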
*res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct PowInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_strided_impl; + return fn; + } + } +}; + } // namespace pow } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 6cd306a900..051a1f9029 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -313,6 +314,194 @@ template struct RemainderStridedFactory } }; +template struct RemainderInplaceFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + // functor is only well-defined when argT and resT are the same + static_assert(std::is_same_v); + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in == argT(0)) { + res = 0; + return; + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto tmp = res; + res %= in; + if (res != resT(0) && l_xor(tmp < 0, in < 0)) { + res += in; + } + } + else { + res %= in; + } + } + else { + res = sycl::fmod(res, in); + if (res) { + if (l_xor(in < 0, res < 0)) { + res += in; + } + } + else { + res = sycl::copysign(resT(0), in); + } + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (in[i] == argT(0)) { + res[i] = 0; + } + else { + auto rem = res[i] % in[i]; + if constexpr (std::is_signed_v || + std::is_signed_v) { + if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) { + rem += in[i]; + } + } + res[i] = rem; + } + } + } + else { + res = sycl::fmod(res, in); +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (res[i]) { + if (l_xor(in[i] < 0, res[i] < 0)) { + res[i] += in[i]; + } + } + else { + res[i] = sycl::copysign(resT(0), in[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const + { + return (b1 != b2); + } +}; + +template +using RemainderInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + RemainderInplaceFunctor, + vec_sz, + n_vecs>; + +template +using RemainderInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + RemainderInplaceFunctor>; + +template +class remainder_inplace_contig_kernel; + +template +sycl::event +remainder_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, RemainderInplaceContigFunctor, + remainder_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, 
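// RemainderInplaceFunctor above follows Python/NumPy remainder semantics:
// the result takes the sign of the divisor. Integers use % plus a sign
// correction; floats use fmod plus the same correction, with copysign
// giving a zero result the divisor's sign. A floating-point sketch
// (hypothetical name; std:: calls stand in for the sycl:: builtins):
#include <cmath>

template <typename T>
T pymod_sketch(T a, T b)
{
    T r = std::fmod(a, b); // carries the sign of the dividend a
    if (r != T(0)) {
        if ((b < T(0)) != (r < T(0))) {
            r += b; // move into the divisor's sign range
        }
    }
    else {
        r = std::copysign(T(0), b); // signed zero follows the divisor
    }
    return r;
}
// e.g. pymod_sketch(-7.0, 3.0) == 2.0, while std::fmod(-7.0, 3.0) == -1.0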
depends); +} + +template +struct RemainderInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename RemainderOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_contig_impl; + return fn; + } + } +}; + +template +class remainder_inplace_strided_kernel; + +template +sycl::event remainder_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, RemainderInplaceStridedFunctor, + remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct RemainderInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename RemainderOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_strided_impl; + return fn; + } + } +}; + } // namespace remainder } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp new file mode 100644 index 0000000000..de51b31c30 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -0,0 +1,179 @@ +//=== rsqrt.hpp - Unary function RSQRT ------ +//*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of RSQRT(x) +/// function that computes the reciprocal square root. 
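// rsqrt(x) = 1 / sqrt(x). The functor below defers to sycl::rsqrt, which
// devices commonly lower to a dedicated instruction; an equivalent host
// reference for validating results is simply (hypothetical name):
#include <cmath>

template <typename T>
T rsqrt_reference(T x)
{
    return T(1) / std::sqrt(x); // +inf at x == +0, NaN for x < 0
}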
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace rsqrt +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template struct RsqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + return sycl::rsqrt(in); + } +}; + +template +using RsqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs>; + +template +using RsqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct RsqrtOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class rsqrt_contig_kernel; + +template +sycl::event rsqrt_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct RsqrtContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_contig_impl; + return fn; + } + } +}; + +template struct RsqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::rsqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename RsqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class rsqrt_strided_kernel; + +template +sycl::event +rsqrt_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct RsqrtStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_strided_impl; + return fn; + } + } +}; + +} // namespace rsqrt +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 3eb8420933..e4ae857738 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index 9f488e6598..86fb0ca2e2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -370,6 +371,233 @@ struct TrueDivideContigRowContigMatrixBroadcastFactory } }; +template struct TrueDivideInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + res /= in; + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res /= in; + } +}; + +// cannot use the out of place table, as it permits real lhs and complex rhs +// T1 corresponds to the type of the rhs, while T2 corresponds to the lhs +// the type of the result must be the same as T2 +template struct TrueDivideInplaceOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +struct TrueDivideInplaceTypeMapFactory +{ + /*! 
@brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename TrueDivideInplaceOutputType::value_type; + static_assert(std::is_same_v || std::is_same_v); + return td_ns::GetTypeid{}.get(); + } +}; + +template +using TrueDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + TrueDivideInplaceFunctor, + vec_sz, + n_vecs>; + +template +using TrueDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + TrueDivideInplaceFunctor>; + +template +class true_divide_inplace_contig_kernel; + +template +sycl::event +true_divide_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, TrueDivideInplaceContigFunctor, + true_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct TrueDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_strided_kernel; + +template +sycl::event true_divide_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, TrueDivideInplaceStridedFunctor, + true_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TrueDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_strided_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_row_matrix_broadcast_sg_krn; + +template +using TrueDivideInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + TrueDivideInplaceFunctor>; + +template +sycl::event true_divide_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, TrueDivideInplaceRowMatrixBroadcastingFunctor, + true_divide_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct TrueDivideInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename TrueDivideInplaceOutputType::value_type; + if constexpr (!std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + } // 
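// Because in-place division writes through the left operand, the
// TrueDivideInplaceOutputType table above only admits pairs whose promoted
// result type equals the lhs type T2, unlike the out-of-place table (which
// would permit a real lhs with a complex rhs). A compile-time sketch of
// that constraint (hypothetical helper):
#include <complex>
#include <type_traits>
#include <utility>

template <typename rhsT, typename lhsT>
constexpr bool inplace_divide_ok()
{
    // the common type of lhs / rhs must round-trip into the lhs
    using resT = decltype(std::declval<lhsT>() / std::declval<rhsT>());
    return std::is_same_v<resT, lhsT>;
}

static_assert(inplace_divide_ok<float, std::complex<float>>());
static_assert(!inplace_divide_ok<std::complex<float>, float>());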
namespace true_divide } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 7dfc956492..adbf96be10 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include "pybind11/pybind11.h" +#include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" #include "utils/type_dispatch.hpp" @@ -39,6 +41,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; namespace dpctl { @@ -47,6 +50,20 @@ namespace tensor namespace kernels { +template struct needs_workaround +{ + static constexpr bool value = + std::is_same_v> && + (std::is_same_v || std::is_same_v); +}; + +template struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !needs_workaround::value; +}; + template (inp_[inp_offset]); + red_val = reduction_op_(red_val, val); } out_[out_iter_offset] = red_val; @@ -153,7 +172,7 @@ struct ReductionOverGroupWithAtomicFunctor const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - // work-items sums over input with indices + // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -191,11 +210,17 @@ struct ReductionOverGroupWithAtomicFunctor sycl::memory_scope::device, sycl::access::address_space::global_space> res_ref(out_[out_iter_offset]); - if constexpr (std::is_same_v> || - std::is_same_v>) - { + if constexpr (su_ns::IsPlus::value) { res_ref += red_val_over_wg; } + else if constexpr (std::is_same_v>) + { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (std::is_same_v>) + { + res_ref.fetch_min(red_val_over_wg); + } else { outT read_val = res_ref.load(); outT new_val{}; @@ -207,7 +232,103 @@ struct ReductionOverGroupWithAtomicFunctor } }; -typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + 
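// Each work-group below folds up to reductions_per_wi * wg elements of the
// reduction axis into one value in shared local memory via
// custom_reduce_over_group, and the group leader then merges that value
// into the global result with the compare-exchange loop at the end; this
// functor serves operators for which sycl::reduce_over_group has no known
// identity.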
const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } +}; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -223,27 +344,51 @@ typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( const std::vector &); template -class sum_reduction_over_group_with_atomics_krn; +class reduction_over_group_with_atomics_krn; -template -class sum_reduction_over_group_with_atomics_init_krn; +template +class custom_reduction_over_group_with_atomics_krn; + +template +class reduction_over_group_with_atomics_init_krn; template -class sum_reduction_seq_strided_krn; +class reduction_seq_strided_krn; template -class sum_reduction_seq_contig_krn; +class reduction_seq_contig_krn; template -class sum_reduction_axis0_over_group_with_atomics_contig_krn; +class reduction_axis0_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis0_over_group_with_atomics_contig_krn; template -class sum_reduction_axis1_over_group_with_atomics_contig_krn; +class reduction_axis1_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis1_over_group_with_atomics_contig_krn; using dpctl::tensor::sycl_utils::choose_workgroup_size; -template -sycl::event sum_reduction_over_group_with_atomics_strided_impl( +template +sycl::event reduction_over_group_with_atomics_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
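// When the reduction operator has no dedicated atomic (only +=, fetch_max
// and fetch_min are special-cased above), each group leader publishes its
// partial result with a compare-exchange loop. A standalone sketch of that
// device-side update pattern, assuming a SYCL 2020 compiler (hypothetical
// name):
#include <sycl/sycl.hpp>

template <typename T, typename Op>
void atomic_accumulate_sketch(T *dest, T group_val, Op op)
{
    sycl::atomic_ref<T, sycl::memory_order::relaxed,
                     sycl::memory_scope::device,
                     sycl::access::address_space::global_space>
        ref(*dest);
    T seen = ref.load();
    T desired;
    do {
        desired = op(seen, group_val); // recompute against the latest value
    } while (!ref.compare_exchange_strong(seen, desired));
}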
of rows in a matrix // when reducing over rows) @@ -263,8 +408,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -285,7 +429,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -308,8 +452,8 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); using InitKernelName = - class sum_reduction_over_group_with_atomics_init_krn; + class reduction_over_group_with_atomics_init_krn; cgh.depends_on(depends); cgh.parallel_for( @@ -333,11 +477,11 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - constexpr size_t preferrered_reductions_per_wi = 4; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -347,18 +491,38 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); return comp_ev; @@ -367,7 +531,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( // Contig -typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( +typedef sycl::event (*reduction_contig_impl_fn_ptr)( sycl::queue 
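// Work partitioning used throughout these drivers: each work-item folds up
// to reductions_per_wi inputs, a work-group therefore covers
// reductions_per_wi * wg elements of the reduction axis, and the group
// count is the ceiling division seen above. As a sketch:
#include <cstddef>

inline std::size_t reduction_groups_sketch(std::size_t reduction_nelems,
                                           std::size_t reductions_per_wi,
                                           std::size_t wg)
{
    // ceil(reduction_nelems / (reductions_per_wi * wg))
    return (reduction_nelems + reductions_per_wi * wg - 1) /
           (reductions_per_wi * wg);
}
// e.g. 10000 elements with 8 per work-item and wg == 256 -> 5 groups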
&, size_t, size_t, @@ -379,8 +543,8 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( const std::vector &); /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( +template +sycl::event reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -397,8 +561,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -422,7 +585,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -456,11 +619,11 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( result_indexer}; ReductionIndexerT reduction_indexer{}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -470,28 +633,47 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis1_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( +template +sycl::event 
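// The axis-0 variant below collapses a C-contiguous matrix along its rows,
// producing one value per column, so consecutive elements of one reduction
// sit iter_nelems apart in memory; that is what the Strided1DIndexer with
// step iter_nelems encodes. A serial reference (hypothetical name):
#include <cstddef>

template <typename T, typename Op>
void reduce_axis0_sketch(const T *mat, T *out, std::size_t n_rows,
                         std::size_t n_cols, T identity, Op op)
{
    for (std::size_t j = 0; j < n_cols; ++j) {
        T acc = identity;
        for (std::size_t i = 0; i < n_rows; ++i) {
            acc = op(acc, mat[i * n_cols + j]); // stride of n_cols per read
        }
        out[j] = acc;
    }
}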
reduction_axis0_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of cols in a matrix // when reducing over cols) @@ -508,14 +690,47 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - { + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { sycl::event res_init_ev = exec_q.fill( res_tp, resTy(identity_val), iter_nelems, depends); @@ -537,11 +752,11 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -551,21 +766,40 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis0_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } @@ -618,7 +852,7 @@ struct ReductionOverGroupNoAtomicFunctor const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - // work-items sums over input with indices + // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -658,95 +892,4368 @@ struct ReductionOverGroupNoAtomicFunctor } }; -template -class sum_reduction_over_group_temps_krn; +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ -template -sycl::event sum_reduction_over_group_temps_strided_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
- // number of columns) - const char *arg_cp, - char *res_cp, - int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, - const std::vector &depends) +template +struct CustomReductionOverGroupNoAtomicFunctor { - const argTy *arg_tp = reinterpret_cast(arg_cp); - resTy *res_tp = reinterpret_cast(res_cp); - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } - constexpr size_t preferrered_reductions_per_wi = 4; - size_t max_wg = d.get_info(); + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requires 1 work-group, can output directly to res - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; - size_t reduction_groups = - (reduction_nelems + reductions_per_wi 
* wg - 1) / - (reductions_per_wi * wg); - assert(reduction_groups == 1); + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); + local_red_val = reduction_op_(local_red_val, val); + } + } + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class reduction_over_group_temps_strided_krn; + +template +class custom_reduction_over_group_temps_strided_krn; + +template +class reduction_over_group_temps_empty_krn; + +template +class single_reduction_axis0_temps_contig_krn; + +template +class first_reduction_axis0_temps_contig_krn; + +template +class middle_reduction_axis0_temps_contig_krn; + +template +class final_reduction_axis0_temps_contig_krn; + +template +class single_custom_reduction_axis0_temps_contig_krn; + +template +class first_custom_reduction_axis0_temps_contig_krn; + +template +class middle_custom_reduction_axis0_temps_contig_krn; + +template +class final_custom_reduction_axis0_temps_contig_krn; + +template +class single_reduction_axis1_temps_contig_krn; + +template +class first_reduction_axis1_temps_contig_krn; + +template +class middle_reduction_axis1_temps_contig_krn; + +template +class final_reduction_axis1_temps_contig_krn; + +template +class single_custom_reduction_axis1_temps_contig_krn; + +template +class first_custom_reduction_axis1_temps_contig_krn; + +template +class middle_custom_reduction_axis1_temps_contig_krn; + +template +class final_custom_reduction_axis1_temps_contig_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr resTy identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_temps_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + 
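// Operators with a known identity take the sycl::reduce_over_group kernel
// in this branch; all others fall through to the else-branch below, which
// allocates shared local memory and reduces with custom_reduce_over_group
// instead.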
cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + 
ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class reduction_over_group_temps_strided_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + local_memory, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, 
static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); + + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event reduction_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+    //                          number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    py::ssize_t iter_arg_offset,
+    py::ssize_t iter_res_offset,
+    py::ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    constexpr resTy identity_val = su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using InputIterIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIterIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            InputOutputIterIndexerT in_out_iter_indexer{
+                InputIterIndexerT{0, static_cast<py::ssize_t>(iter_nelems),
+                                  static_cast<py::ssize_t>(reduction_nelems)},
+                NoOpIndexerT{}};
+            ReductionIndexerT reduction_indexer{};
+
+            cgh.parallel_for<class reduction_seq_contig_krn<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>>(
+                sycl::range<1>(iter_nelems),
+                SequentialReduction<argTy, resTy, ReductionOpT,
+                                    InputOutputIterIndexerT,
+                                    ReductionIndexerT>(
+                    arg_tp, res_tp, ReductionOpT(), identity_val,
+                    in_out_iter_indexer, reduction_indexer, reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    constexpr size_t preferred_reductions_per_wi = 8;
+    // max_max_wg prevents running out of resources on CPU
+    constexpr size_t max_max_wg = 2048;
+    size_t max_wg = std::min(
+        max_max_wg,
+        d.get_info<sycl::info::device::max_work_group_size>() / 2);
+
+    size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using InputIterIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIterIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            InputOutputIterIndexerT in_out_iter_indexer{
+                InputIterIndexerT{0, static_cast<py::ssize_t>(iter_nelems),
+                                  static_cast<py::ssize_t>(reduction_nelems)},
+                NoOpIndexerT{}};
+            ReductionIndexerT reduction_indexer{};
+
+            if (iter_nelems == 1) {
+                // increase GPU occupancy
+                wg = max_wg;
+            }
+            reductions_per_wi =
+                std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+            size_t reduction_groups =
+                (reduction_nelems + reductions_per_wi * wg - 1) /
+                (reductions_per_wi * wg);
+            assert(reduction_groups == 1);
+
+            auto globalRange =
+                sycl::range<1>{iter_nelems * reduction_groups * wg};
+            auto localRange = sycl::range<1>{wg};
+
+            if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                    resTy>::value)
+            {
+                using KernelName =
+                    class single_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>;
+
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    ReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(),
+                                           identity_val, in_out_iter_indexer,
+                                           reduction_indexer,
+                                           reduction_nelems, iter_nelems,
+                                           reductions_per_wi));
+            }
+            else {
+                using SlmT = sycl::local_accessor<resTy, 1>;
+                SlmT local_memory = SlmT(localRange, cgh);
+                using KernelName =
+                    class single_custom_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>;
+
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    CustomReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>(
+                        arg_tp, res_tp, ReductionOpT(), identity_val,
+                        in_out_iter_indexer, reduction_indexer, local_memory,
+                        reduction_nelems, iter_nelems, reductions_per_wi));
+            }
+        });
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
+        size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        resTy *partially_reduced_tmp = sycl::malloc_device<resTy>(
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_),
+            exec_q);
+        resTy *partially_reduced_tmp2 = nullptr;
+
+        if (partially_reduced_tmp == nullptr) {
+            throw std::runtime_error("Unable to allocate device_memory");
+        }
+        else {
+            partially_reduced_tmp2 =
+                partially_reduced_tmp + reduction_groups * iter_nelems;
+        }
+
+        const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler
+                                                                      &cgh) {
+            cgh.depends_on(depends);
+
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    RowsIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = NoOpIndexerT;
+
+            RowsIndexerT rows_indexer{
+                0, static_cast<py::ssize_t>(iter_nelems),
+                static_cast<py::ssize_t>(reduction_nelems)};
+            NoOpIndexerT noop_tmp_indexer{};
+            InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
+                                                        noop_tmp_indexer};
+            ReductionIndexerT reduction_indexer{};
+
+            auto globalRange =
+                sycl::range<1>{iter_nelems * reduction_groups * wg};
+            auto localRange = sycl::range<1>{wg};
+
+            if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                    resTy>::value)
+            {
+                using KernelName =
+                    class first_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    ReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>(
+                        arg_tp, partially_reduced_tmp, ReductionOpT(),
+                        identity_val, in_out_iter_indexer, reduction_indexer,
+                        reduction_nelems, iter_nelems,
+                        preferred_reductions_per_wi));
+            }
+            else {
+                using SlmT = sycl::local_accessor<resTy, 1>;
+                SlmT local_memory = SlmT(localRange, cgh);
+                using KernelName =
+                    class first_custom_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    CustomReductionOverGroupNoAtomicFunctor<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>(
+                        arg_tp, partially_reduced_tmp, ReductionOpT(),
+                        identity_val, in_out_iter_indexer, reduction_indexer,
+                        local_memory, reduction_nelems, iter_nelems,
+                        preferred_reductions_per_wi));
+            }
+        });
+
+        size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
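+        // Illustrative sizing (added, hedged; assumes wg == 256,
+        // max_wg == 2048, preferred_reductions_per_wi == 8): a row of 10^6
+        // elements leaves ceil(1e6 / (8 * 256)) == 489 partial values after
+        // the first pass; 489 <= 8 * 2048, so the while-loop below is
+        // skipped and the final single work-group pass folds the 489
+        // partials directly into res.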
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            size_t reduction_groups_ =
+                (remaining_reduction_nelems +
+                 preferred_reductions_per_wi * wg - 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler
+                                                                     &cgh) {
+                cgh.depends_on(dependent_ev);
+
+                using InputIndexerT =
+                    dpctl::tensor::offset_utils::Strided1DIndexer;
+                using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+                using InputOutputIterIndexerT =
+                    dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                        InputIndexerT, ResIndexerT>;
+                using ReductionIndexerT =
+                    dpctl::tensor::offset_utils::NoOpIndexer;
+
+                InputIndexerT inp_indexer{
+                    0, static_cast<py::ssize_t>(iter_nelems),
+                    static_cast<py::ssize_t>(reduction_groups_)};
+                ResIndexerT res_iter_indexer{};
+
+                InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                            res_iter_indexer};
+                ReductionIndexerT reduction_indexer{};
+
+                auto globalRange =
+                    sycl::range<1>{iter_nelems * reduction_groups_ * wg};
+                auto localRange = sycl::range<1>{wg};
+                if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                        resTy>::value) {
+                    using KernelName =
+                        class middle_reduction_axis1_temps_contig_krn<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT>;
+                    cgh.parallel_for<KernelName>(
+                        sycl::nd_range<1>(globalRange, localRange),
+                        ReductionOverGroupNoAtomicFunctor<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT>(
+                            temp_arg, temp2_arg, ReductionOpT(), identity_val,
+                            in_out_iter_indexer, reduction_indexer,
+                            remaining_reduction_nelems, iter_nelems,
+                            preferred_reductions_per_wi));
+                }
+                else {
+                    using SlmT = sycl::local_accessor<resTy, 1>;
+                    SlmT local_memory = SlmT(localRange, cgh);
+                    using KernelName =
+                        class middle_custom_reduction_axis1_temps_contig_krn<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT, SlmT>;
+                    cgh.parallel_for<KernelName>(
+                        sycl::nd_range<1>(globalRange, localRange),
+                        CustomReductionOverGroupNoAtomicFunctor<
+                            resTy, resTy, ReductionOpT,
+                            InputOutputIterIndexerT, ReductionIndexerT, SlmT>(
+                            temp_arg, temp2_arg, ReductionOpT(), identity_val,
+                            in_out_iter_indexer, reduction_indexer,
+                            local_memory, remaining_reduction_nelems,
+                            iter_nelems, preferred_reductions_per_wi));
+                }
+            });
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(dependent_ev);
+
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            InputIndexerT inp_indexer{
+                0, static_cast<py::ssize_t>(iter_nelems),
+                static_cast<py::ssize_t>(remaining_reduction_nelems)};
+            ResIndexerT res_iter_indexer{};
+
+            InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                        res_iter_indexer};
+            ReductionIndexerT reduction_indexer{};
+
+            wg = max_wg;
+            reductions_per_wi =
+                std::max<size_t>(1, (remaining_reduction_nelems + wg - 1) / wg);
+
+            size_t reduction_groups =
+                (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+                (reductions_per_wi * wg);
+            assert(reduction_groups == 1);
+
+            auto globalRange =
+                sycl::range<1>{iter_nelems * reduction_groups * wg};
+            auto localRange = sycl::range<1>{wg};
+
+            if constexpr (can_use_reduce_over_group<ReductionOpT,
+                                                    resTy>::value)
+            {
+                using KernelName =
+                    class final_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    ReductionOverGroupNoAtomicFunctor<
+                        resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(),
+                                           identity_val, in_out_iter_indexer,
+                                           reduction_indexer,
+                                           remaining_reduction_nelems,
+                                           iter_nelems, reductions_per_wi));
+            }
+            else {
+                using SlmT = sycl::local_accessor<resTy, 1>;
+                SlmT local_memory = SlmT(localRange, cgh);
+                using KernelName =
+                    class final_custom_reduction_axis1_temps_contig_krn<
+                        argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    CustomReductionOverGroupNoAtomicFunctor<
+                        resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                        ReductionIndexerT, SlmT>(
+                        temp_arg, res_tp, ReductionOpT(), identity_val,
+                        in_out_iter_indexer, reduction_indexer, local_memory,
+                        remaining_reduction_nelems, iter_nelems,
+                        reductions_per_wi));
+            }
+        });
+
+        sycl::event cleanup_host_task_event =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(final_reduction_ev);
+                const sycl::context &ctx = exec_q.get_context();
+
+                cgh.host_task([ctx, partially_reduced_tmp] {
+                    sycl::free(partially_reduced_tmp, ctx);
+                });
+            });
+
+        // FIXME: do not return host-task event
+        //   Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename argTy, typename resTy, typename ReductionOpT>
+sycl::event reduction_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    size_t iter_nelems, // number of reductions    (num. of rows in a matrix
+                        // when reducing over rows)
+    size_t reduction_nelems, // size of each reduction  (length of rows, i.e.
+ // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr resTy identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class single_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, 
in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT noop_tmp_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = 
reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_reduction_axis0_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_reduction_axis0_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + local_memory, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + 
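+            // Note (added, hedged): wg was just raised to max_wg and
+            // reductions_per_wi sized so that one work-group per iteration
+            // covers all remaining partial values; hence the single-group
+            // invariant asserted next.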
assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); + + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +/* @brief Types supported by comparison-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForCompReductionAtomic +{ + + /* value is true if a kernel for must be instantiated, false + * otherwise */ + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForCompReductionTemps +{ + + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< + // input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, 
dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, 
dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +// Sum + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
// input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return 
dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +// Product + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
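+        // Note (added, hedged): each TypePairDefinedEntry row names one
+        // (input, output) type pair for which a product kernel is
+        // instantiated; pairs not listed fall through to NotDefinedEntry,
+        // so the corresponding factory's get() returns a nullptr function
+        // pointer.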
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, 
ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct TypePairSupportDataForHypotReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct HypotOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct TypePairSupportDataForLogSumExpReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct LogSumExpOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +// Argmax and Argmin + +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction(const argT *inp, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const py::ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const py::ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (size_t m = 0; m < reduction_max_gid_; ++m) { + const py::ssize_t inp_reduction_offset = + inp_reduced_dims_indexer_(m); + const py::ssize_t inp_offset = + inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == red_val) { + idx_val = idx_reduction_op_(idx_val, static_cast(m)); + } + else { + 
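+                // Note (added, hedged): the comparison branches below
+                // deliberately propagate NaNs: a NaN candidate always
+                // replaces the running extreme value, so the reported index
+                // tracks a NaN whenever one occurs in the reduced slice
+                // (NaN-aware min/max semantics).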
if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val < red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val > red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + } + } + out_[out_iter_offset] = idx_val; + } +}; + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + 
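// A host-side sketch (not from this patch) of the selection rule the
// sequential branch above applies per element: a NaN candidate always
// displaces the running value (val < red || isnan(val)), so any NaN in the
// reduced range wins over every finite value, and ties between equal values
// resolve toward the smaller index via the index-minimum combine.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

static std::size_t argmin_sketch(const std::vector<double> &v)
{
    std::size_t idx = 0;
    double red = v[0];
    for (std::size_t m = 1; m < v.size(); ++m) {
        const double val = v[m];
        if (val == red) {
            idx = std::min(idx, m); // mirrors idx_reduction_op_ == minimum
        }
        else if (val < red || std::isnan(val)) {
            red = val;
            idx = m;
        }
    }
    return idx;
}

int main()
{
    std::cout << argmin_sketch({3.0, 1.0, 1.0, 2.0}) << "\n"; // 1
    std::cout << argmin_sketch({3.0, NAN, 0.0}) << "\n";      // 1 (NaN wins)
}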
size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t 
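// A host-side sketch (not from this patch) of the two-step group combine
// used above: values are reduced first; then every work-item whose local
// value does not match the group result contributes the index identity
// (for sycl::minimum over indices, the maximum representable index), so a
// second reduce over indices selects the first matching position.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

int main()
{
    // per-work-item partial results (local_red_val, local_idx)
    const std::vector<double> local_val{4.0, 2.0, 2.0, 7.0};
    const std::vector<std::int64_t> local_idx{0, 5, 3, 9};
    constexpr std::int64_t idx_identity =
        std::numeric_limits<std::int64_t>::max();

    // step 1: reduce_over_group on the values (here: minimum)
    const double red_val =
        *std::min_element(local_val.begin(), local_val.end());

    // step 2: mask the losers with the identity, then min-reduce the indices
    std::int64_t idx = idx_identity;
    for (std::size_t i = 0; i < local_val.size(); ++i) {
        idx = std::min(idx, (local_val[i] == red_val) ? local_idx[i]
                                                      : idx_identity);
    }
    std::cout << red_val << " at index " << idx << "\n"; // 2 at index 3
}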
reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so + // check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + 
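// The complex branches above rely on math_utils::less_complex /
// greater_complex to impose a total order on std::complex (which has no
// operator<). A stand-in sketch, assuming the usual lexicographic order
// (real part first, imaginary part as tie-break) — this ordering is an
// assumption, not a quote of the dpctl implementation:
#include <complex>
#include <iostream>

template <typename T>
bool less_complex_sketch(const std::complex<T> &x, const std::complex<T> &y)
{
    return (x.real() < y.real()) ||
           (x.real() == y.real() && x.imag() < y.imag());
}

int main()
{
    const std::complex<double> a{1.0, 5.0}, b{1.0, 7.0};
    std::cout << std::boolalpha << less_complex_sketch(a, b) << "\n"; // true
}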
std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_strided_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class search_seq_strided_krn; + +template +class search_over_group_temps_strided_krn; + +template +class custom_search_over_group_temps_strided_krn; + +template class search_empty_krn; + +template +class search_seq_contig_krn; + +template +class single_search_axis0_temps_contig_krn; + +template +class first_search_axis0_temps_contig_krn; + +template +class middle_search_axis0_temps_contig_krn; + +template +class final_search_axis0_temps_contig_krn; + +template +class single_custom_search_axis0_temps_contig_krn; + +template +class first_custom_search_axis0_temps_contig_krn; + +template +class middle_custom_search_axis0_temps_contig_krn; + +template +class final_custom_search_axis0_temps_contig_krn; + +template +class single_search_axis1_temps_contig_krn; + +template +class first_search_axis1_temps_contig_krn; + +template +class middle_search_axis1_temps_contig_krn; + +template +class final_search_axis1_temps_contig_krn; + +template +class single_custom_search_axis1_temps_contig_krn; + +template +class first_custom_search_axis1_temps_contig_krn; + +template +class middle_custom_search_axis1_temps_contig_krn; + +template +class final_custom_search_axis1_temps_contig_krn; + +template +sycl::event search_over_group_temps_strided_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class search_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = idx_identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 4; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = + std::min(size_t(2048), + d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, 
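// The launch arithmetic of this single-work-group branch, as a host sketch
// (not from this patch): reductions_per_wi is raised until one group of
// size wg covers the whole reduction, which is exactly what the kernel's
// assert(reduction_groups == 1) checks.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t reduction_nelems = 3000, wg = 256;
    const std::size_t reductions_per_wi =
        std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg); // ceil
    const std::size_t reduction_groups =
        (reduction_nelems + reductions_per_wi * wg - 1) /
        (reductions_per_wi * wg);
    assert(reduction_groups == 1);
    std::cout << reductions_per_wi << " elements per work-item\n"; // 12
}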
+ InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + 
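// Why two scratch allocations: a multi-pass argmin/argmax must carry both
// the candidate values and their indices between passes. Each allocation
// holds two slabs so passes can ping-pong between them, mirroring the
// partially_reduced_tmp / partially_reduced_tmp2 pointer split above.
// A host sketch (not from this patch):
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const std::size_t iter_nelems = 4, reduction_groups = 10,
                      second_iter_groups = 2;
    std::vector<std::int64_t> idx_tmp(
        iter_nelems * (reduction_groups + second_iter_groups));
    std::int64_t *slab_a = idx_tmp.data();
    std::int64_t *slab_b = slab_a + reduction_groups * iter_nelems;
    std::cout << (slab_b - slab_a) << " entries in the first slab\n"; // 40
}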
IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event 
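// The shape of the multi-pass loop above, as a host sketch (not from this
// patch): every pass shrinks the number of surviving partial results by a
// factor of reductions_per_wi * wg, swapping the scratch buffers, until the
// remainder fits a single-group final pass.
#include <cstddef>
#include <iostream>
#include <utility>

int main()
{
    std::size_t remaining = 1000000;
    const std::size_t wg = 256, rpwi = 4, max_wg = 1024;
    int buf = 0, buf2 = 1; // stand-ins for temp_arg / temp2_arg
    int passes = 1;        // the first reduction has already run
    while (remaining > rpwi * max_wg) {
        remaining = (remaining + rpwi * wg - 1) / (rpwi * wg);
        std::swap(buf, buf2); // ping-pong the scratch buffers
        ++passes;
    }
    std::cout << passes << " passes, then a final single-group pass over "
              << remaining << " values\n"; // 2 passes ... over 977 values
}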
final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +typedef sycl::event (*search_contig_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event search_axis1_over_group_temps_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = + std::min(size_t(2048), + d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class single_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, 
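// For this axis-1 contiguous case, Strided1DIndexer{0, iter_nelems,
// reduction_nelems} appears to map row i of a row-major
// (iter_nelems x reduction_nelems) matrix to offset i * reduction_nelems,
// while the reduction itself walks the row at unit stride (NoOpIndexer).
// Host sketch (not from this patch):
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t iter_nelems = 3, reduction_nelems = 5;
    for (std::size_t i = 0; i < iter_nelems; ++i) {
        const std::size_t row_start = 0 + i * reduction_nelems; // offset+step
        std::cout << "row " << i << " starts at " << row_start << "\n";
    }
}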
reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + 
IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT 
in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event search_axis0_over_group_temps_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class search_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = + std::min(size_t(2048), + d.get_info() / 2); + + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class single_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), 
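// In the axis-0 contiguous case the roles flip: the iteration index j is a
// column, and the reduction indexer Strided1DIndexer{0, reduction_nelems,
// iter_nelems} walks that column with step iter_nelems (the row pitch of a
// row-major matrix), so element m of column j sits at j + m * iter_nelems.
// Host sketch (not from this patch):
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t iter_nelems = 4 /* cols */, reduction_nelems = 3;
    const std::size_t j = 2; // reduce column 2
    for (std::size_t m = 0; m < reduction_nelems; ++m) {
        std::cout << "element " << m << " of column " << j << " at offset "
                  << j + m * iter_nelems << "\n"; // 2, 6, 10
    }
}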
idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); return comp_ev; } else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -754,113 +5261,173 @@ sycl::event sum_reduction_over_group_temps_strided_impl( resTy *partially_reduced_tmp2 = nullptr; if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); + throw std::runtime_error("Unable to allocate device_memory"); } else { partially_reduced_tmp2 = partially_reduced_tmp + reduction_groups * iter_nelems; } - const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler - &cgh) { + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; using InputOutputIterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + 
ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } }); size_t remaining_reduction_nelems = reduction_groups; resTy *temp_arg = partially_reduced_tmp; resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing - sycl::event partial_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; - ResIndexerT res_iter_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{ - inp_indexer, res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups_ * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + sycl::event 
partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - temp_arg, temp2_arg, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferred_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - }); + preferred_reductions_per_wi)); + } + }); remaining_reduction_nelems = reduction_groups_; std::swap(temp_arg, temp2_arg); - dependent_ev = std::move(partial_reduction_ev); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; } // final reduction to res @@ -868,8 +5435,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( cgh.depends_on(dependent_ev); using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; using InputOutputIterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< InputIndexerT, ResIndexerT>; @@ -878,10 +5444,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( InputIndexerT inp_indexer{ 0, static_cast(iter_nelems), static_cast(remaining_reduction_nelems)}; - ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, - /* shape */ iter_shape_and_strides, - /*s trides */ iter_shape_and_strides + - 2 * iter_nd}; + ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; @@ -900,28 +5463,54 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class 
sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - temp_arg, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - reductions_per_wi)); + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_search_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } }); sycl::event cleanup_host_task_event = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); + sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, partially_reduced_tmp] { - sycl::free(partially_reduced_tmp, ctx); - }); + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); }); // FIXME: do not return host-task event @@ -931,183 +5520,88 @@ sycl::event sum_reduction_over_group_temps_strided_impl( } } -/* @brief Types supported by plus-reduction code based on atomic_ref */ -template -struct TypePairSupportDataForSumReductionAtomic -{ - - /* value if true a kernel for must be instantiated, false - * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - 
td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input double - td_ns::TypePairDefinedEntry, - // fall-through - td_ns::NotDefinedEntry>::is_defined; -}; - template -struct TypePairSupportDataForSumReductionTemps +struct TypePairSupportDataForSearchReductionTemps { static constexpr bool is_defined = std::disjunction< // disjunction is C++17 // feature, supported // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, // input uint8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, // input uint16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int32_t - td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, - // input uint32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int64_t td_ns::TypePairDefinedEntry, // input uint32_t - td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, // input double - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, // input std::complex td_ns::TypePairDefinedEntry, outTy, - std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + std::int64_t>, td_ns::TypePairDefinedEntry, outTy, - std::complex>, + std::int64_t>, - // fall-throug + // fall-through td_ns::NotDefinedEntry>::is_defined; }; template -struct SumOverAxisAtomicStridedFactory +struct ArgmaxOverAxisTempsStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForSearchReductionTemps< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_strided_impl; + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + 
search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; @@ -1116,14 +5610,32 @@ struct SumOverAxisAtomicStridedFactory }; template -struct SumOverAxisTempsStridedFactory +struct ArgmaxOverAxis1TempsContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionTemps< - srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_temps_strided_impl; + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; @@ -1132,16 +5644,32 @@ struct SumOverAxisTempsStridedFactory }; template -struct SumOverAxis1AtomicContigFactory +struct ArgmaxOverAxis0TempsContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForSearchReductionTemps< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis1_over_group_with_atomics_contig_impl; + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; @@ -1150,16 +5678,100 @@ struct SumOverAxis1AtomicContigFactory }; template -struct SumOverAxis0AtomicContigFactory +struct ArgminOverAxisTempsStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForSearchReductionTemps< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis0_over_group_with_atomics_contig_impl; + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { 
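+                // Assumption (template arguments are elided in this
+                // rendering of the patch): this branch covers bool,
+                // floating-point and complex srcTy. Plain sycl::minimum
+                // compares with operator<, so (NaN < x) and (x < NaN) are
+                // both false and NaNs need not propagate; su_ns::Minimum
+                // tests std::isnan(x) first and returns the NaN operand,
+                // and it is also defined for std::complex via min_complex.
+                // The index op stays sycl::minimum so that ties resolve to
+                // the smallest index.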
+ // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index da1989fc3c..1f2335fc6c 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -46,14 +46,16 @@ namespace py = pybind11; using namespace dpctl::tensor::offset_utils; template class repeat_by_sequence_kernel; template @@ -66,8 +68,8 @@ class RepeatSequenceFunctor const repT *cumsum = nullptr; size_t src_axis_nelems = 1; OrthogIndexer orthog_strider; - AxisIndexer src_axis_strider; - AxisIndexer dst_axis_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; RepIndexer reps_strider; public: @@ -77,8 +79,8 @@ class RepeatSequenceFunctor const repT *cumsum_, size_t src_axis_nelems_, OrthogIndexer orthog_strider_, - AxisIndexer src_axis_strider_, - AxisIndexer dst_axis_strider_, + SrcAxisIndexer src_axis_strider_, + DstAxisIndexer dst_axis_strider_, RepIndexer reps_strider_) : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), @@ -167,12 +169,12 @@ repeat_by_sequence_impl(sycl::queue &q, const size_t gws = orthog_nelems * src_axis_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), RepeatSequenceFunctor( + Strided1DIndexer, Strided1DIndexer, T, repT>( src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, orthog_indexer, src_axis_indexer, dst_axis_indexer, reps_indexer)); @@ -197,8 +199,8 @@ typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( char *, const char *, const char *, - py::ssize_t, - py::ssize_t, + int, + const py::ssize_t *, py::ssize_t, py::ssize_t, py::ssize_t, @@ -212,8 +214,8 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, char *dst_cp, const char *reps_cp, const char *cumsum_cp, - py::ssize_t src_shape, - py::ssize_t src_stride, + int src_nd, + const py::ssize_t *src_shape_strides, py::ssize_t dst_shape, py::ssize_t dst_stride, py::ssize_t reps_shape, @@ -231,19 +233,19 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, // orthog ndim indexer TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis - Strided1DIndexer src_indexer{0, src_shape, src_stride}; + StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; // indexer along reps array Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; const size_t gws = src_nelems; - cgh.parallel_for< - repeat_by_sequence_kernel>( + cgh.parallel_for>( 
sycl::range<1>(gws), - RepeatSequenceFunctor( + RepeatSequenceFunctor( src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, src_indexer, dst_indexer, reps_indexer)); }); @@ -260,10 +262,16 @@ template struct RepeatSequence1DFactory } }; -template +template class repeat_by_scalar_kernel; -template +template class RepeatScalarFunctor { private: @@ -272,8 +280,8 @@ class RepeatScalarFunctor const py::ssize_t reps = 1; size_t dst_axis_nelems = 0; OrthogIndexer orthog_strider; - AxisIndexer src_axis_strider; - AxisIndexer dst_axis_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; public: RepeatScalarFunctor(const T *src_, @@ -281,8 +289,8 @@ class RepeatScalarFunctor const py::ssize_t reps_, size_t dst_axis_nelems_, OrthogIndexer orthog_strider_, - AxisIndexer src_axis_strider_, - AxisIndexer dst_axis_strider_) + SrcAxisIndexer src_axis_strider_, + DstAxisIndexer dst_axis_strider_) : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), dst_axis_strider(dst_axis_strider_) @@ -354,10 +362,11 @@ sycl::event repeat_by_scalar_impl(sycl::queue &q, const size_t gws = orthog_nelems * dst_axis_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), - RepeatScalarFunctor( + RepeatScalarFunctor( src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, src_axis_indexer, dst_axis_indexer)); }); @@ -380,8 +389,8 @@ typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( const char *, char *, const py::ssize_t, - py::ssize_t, - py::ssize_t, + int, + const py::ssize_t *, py::ssize_t, py::ssize_t, const std::vector &); @@ -392,8 +401,8 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, const char *src_cp, char *dst_cp, const py::ssize_t reps, - py::ssize_t src_shape, - py::ssize_t src_stride, + int src_nd, + const py::ssize_t *src_shape_strides, py::ssize_t dst_shape, py::ssize_t dst_stride, const std::vector &depends) @@ -407,17 +416,18 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, // orthog ndim indexer TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis - Strided1DIndexer src_indexer(0, src_shape, src_stride); + StridedIndexer src_indexer(src_nd, 0, src_shape_strides); Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; const size_t gws = dst_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), - RepeatScalarFunctor( - src_tp, dst_tp, reps, dst_nelems, orthog_indexer, src_indexer, - dst_indexer)); + RepeatScalarFunctor(src_tp, dst_tp, reps, + dst_nelems, orthog_indexer, + src_indexer, dst_indexer)); }); return repeat_ev; diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp index d724e03e35..120a14d536 100644 --- a/dpctl/tensor/libtensor/include/utils/math_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/math_utils.hpp @@ -115,6 +115,26 @@ template T min_complex(const T &x1, const T &x2) return (std::isnan(real1) || isnan_imag1 || lt) ? 
x1 : x2; } +template T logaddexp(T x, T y) +{ + if (x == y) { // handle signed infinities + const T log2 = std::log(T(2)); + return x + log2; + } + else { + const T tmp = x - y; + if (tmp > 0) { + return x + std::log1p(std::exp(-tmp)); + } + else if (tmp <= 0) { + return y + std::log1p(std::exp(tmp)); + } + else { + return std::numeric_limits::quiet_NaN(); + } + } +} + } // namespace math_utils } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 2fc7b02efa..c0165b0ecc 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -26,14 +26,79 @@ #include #include #include +#include #include +#include "math_utils.hpp" + namespace dpctl { namespace tensor { namespace sycl_utils { +namespace detail +{ + +template struct TypeList; + +template struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template struct IsContained : std::false_type +{ +}; + +template struct IsComplex : std::false_type +{ +}; +template struct IsComplex> : std::true_type +{ +}; + +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value; +}; /*! @brief Find the smallest multiple of supported sub-group size larger than * nelems */ @@ -66,6 +131,223 @@ size_t choose_workgroup_size(const size_t nelems, return wg; } +template +T custom_reduce_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T &local_val, + const OpT &op) +{ + size_t wgs = wg.get_local_linear_range(); + local_mem_acc[wg.get_local_linear_id()] = local_val; + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + T red_val_over_wg = local_mem_acc[0]; + if (wg.leader()) { + for (size_t i = 1; i < wgs; ++i) { + red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]); + } + } + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return sycl::group_broadcast(wg, red_val_over_wg); +} + +// Reduction functors + +// Maximum + +template struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? 
x : y; + } + } +}; + +// Define identities and operator checking structs + +template struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant> || + std::is_same_v>>; +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// LogSumExp + +template struct LogSumExp +{ + T operator()(const T &x, const T &y) const + { + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(x, y); + } +}; + +template +using IsLogSumExp = std::bool_constant>>; + +// only defined for types with infinity +template +struct GetIdentity::value>> +{ + static constexpr T value = -std::numeric_limits::infinity(); +}; + +// Hypot + +template struct Hypot +{ + T operator()(const T &x, const T &y) const + { + return sycl::hypot(x, y); + } +}; + +template +using IsHypot = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = 0; +}; + +// Identity + +template struct Identity +{ +}; + +template +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + } // namespace sycl_utils } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp new file mode 100644 index 0000000000..ac494c19ae --- /dev/null +++ b/dpctl/tensor/libtensor/source/clip.cpp @@ -0,0 +1,269 @@ +//===-- clip.cpp - Implementation of clip --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
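The Identity/GetIdentity machinery above resolves a reduction's neutral element at compile time: built-in SYCL ops with a known identity defer to sycl::known_identity, while the custom Maximum/Minimum/LogSumExp/Hypot functors are looked up through the GetIdentity specializations keyed by the Is* predicates. A minimal sketch of a caller, assuming the parameter order Identity<Op, T> (the template parameters themselves were elided in this rendering of the patch) and that the definitions above are in scope:

    #include <limits>
    #include <sycl/sycl.hpp>

    namespace su_ns = dpctl::tensor::sycl_utils;

    // custom max over floats: identity is -inf via GetIdentity/IsMaximum
    static_assert(su_ns::Identity<su_ns::Maximum<float>, float>::value ==
                  -std::numeric_limits<float>::infinity());

    // built-in plus over ints: identity comes from sycl::known_identity
    static_assert(su_ns::Identity<sycl::plus<int>, int>::value == 0);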
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "clip.hpp" +#include "kernels/clip.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t; + +static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types]; +static clip_strided_impl_fn_ptr_t + clip_strided_dispatch_vector[td_ns::num_types]; + +void init_clip_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::clip::ClipContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(clip_contig_dispatch_vector); + + using dpctl::tensor::kernels::clip::ClipStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(clip_strided_dispatch_vector); +} + +using dpctl::utils::keep_args_alive; + +std::pair +py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + int nd = src.get_ndim(); + int min_nd = min.get_ndim(); + int max_nd = max.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != min_nd || nd != max_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for clip kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for clip kernel."); + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *min_shape = min.get_shape_raw(); + const py::ssize_t *max_shape = max.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (min_shape[i] == sh_i) && + (max_shape[i] == sh_i) && (src_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Arrays are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, src) && !same_logical_tensors(dst, src)) || + (overlap(dst, min) && !same_logical_tensors(dst, min)) || + (overlap(dst, max) && !same_logical_tensors(dst, max))) + { + throw py::value_error("Destination array overlaps with input."); + } + + int min_typenum = min.get_typenum(); + int max_typenum = max.get_typenum(); + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + 
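+    // typenum_to_lookup_id maps the NumPy-style typenum stored on the
+    // usm_ndarray onto the dense [0, td_ns::num_types) index used to
+    // address clip_contig_dispatch_vector/clip_strided_dispatch_vector;
+    // all four ids are compared below because a single id selects the
+    // kernel for src, min, max and dst alike.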
int min_typeid = array_types.typenum_to_lookup_id(min_typenum); + int max_typeid = array_types.typenum_to_lookup_id(max_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid || src_typeid != min_typeid || + src_typeid != max_typeid) + { + throw py::value_error("Input, min, max, and destination arrays must " + "have the same data type"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + char *src_data = src.get_data(); + char *min_data = min.get_data(); + char *max_data = max.get_data(); + char *dst_data = dst.get_data(); + + bool is_min_c_contig = min.is_c_contiguous(); + bool is_min_f_contig = min.is_f_contiguous(); + + bool is_max_c_contig = max.is_c_contiguous(); + bool is_max_f_contig = max.is_f_contiguous(); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_min_c_contig && is_max_c_contig && + is_src_c_contig && is_dst_c_contig); + bool all_f_contig = (is_min_f_contig && is_max_f_contig && + is_src_f_contig && is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto fn = clip_contig_dispatch_vector[src_typeid]; + + sycl::event clip_ev = + fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev}); + + return std::make_pair(ht_ev, clip_ev); + } + + auto const &src_strides = src.get_strides_vector(); + auto const &min_strides = min.get_strides_vector(); + auto const &max_strides = max.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_min_strides; + shT simplified_max_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t min_offset(0); + py::ssize_t max_offset(0); + py::ssize_t dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space_4( + nd, src_shape, src_strides, min_strides, max_strides, dst_strides, + // outputs + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides, src_offset, min_offset, + max_offset, dst_offset); + + auto fn = clip_strided_dispatch_vector[src_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides); + py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data, 
+ dst_data, packed_shape_strides, src_offset, + min_offset, max_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(clip_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([packed_shape_strides, ctx]() { + sycl::free(packed_shape_strides, ctx); + }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, min, max, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, clip_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/clip.hpp b/dpctl/tensor/libtensor/source/clip.hpp new file mode 100644 index 0000000000..d4b8af2cf5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/clip.hpp @@ -0,0 +1,52 @@ +//===-- clip.hpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.clip +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +extern void init_clip_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp deleted file mode 100644 index cca0ac7c0a..0000000000 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ /dev/null @@ -1,4784 +0,0 @@ -//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
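The temporaries_cleanup_ev submission above is the idiom these sources use to reclaim USM scratch space without blocking the host: the allocation stays alive until the kernel that reads it completes, and a host_task ordered after that kernel calls sycl::free. Factored out as a small helper (hypothetical name, standard SYCL 2020 only):

    #include <sycl/sycl.hpp>

    // Free a USM allocation once `dep` has completed, without
    // synchronizing the host; returns the event of the cleanup task.
    sycl::event free_after(sycl::queue &q, void *usm_ptr,
                           const sycl::event &dep)
    {
        return q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(dep);
            const sycl::context ctx = q.get_context();
            cgh.host_task([usm_ptr, ctx]() { sycl::free(usm_ptr, ctx); });
        });
    }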
-// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions, -/// specifically functions for elementwise operations. -//===----------------------------------------------------------------------===// - -#include "dpctl4pybind11.hpp" -#include -#include -#include -#include -#include - -#include "elementwise_functions.hpp" -#include "utils/type_dispatch.hpp" - -#include "kernels/elementwise_functions/abs.hpp" -#include "kernels/elementwise_functions/acos.hpp" -#include "kernels/elementwise_functions/acosh.hpp" -#include "kernels/elementwise_functions/add.hpp" -#include "kernels/elementwise_functions/asin.hpp" -#include "kernels/elementwise_functions/asinh.hpp" -#include "kernels/elementwise_functions/atan.hpp" -#include "kernels/elementwise_functions/atan2.hpp" -#include "kernels/elementwise_functions/atanh.hpp" -#include "kernels/elementwise_functions/bitwise_and.hpp" -#include "kernels/elementwise_functions/bitwise_invert.hpp" -#include "kernels/elementwise_functions/bitwise_left_shift.hpp" -#include "kernels/elementwise_functions/bitwise_or.hpp" -#include "kernels/elementwise_functions/bitwise_right_shift.hpp" -#include "kernels/elementwise_functions/bitwise_xor.hpp" -#include "kernels/elementwise_functions/ceil.hpp" -#include "kernels/elementwise_functions/conj.hpp" -#include "kernels/elementwise_functions/cos.hpp" -#include "kernels/elementwise_functions/cosh.hpp" -#include "kernels/elementwise_functions/equal.hpp" -#include "kernels/elementwise_functions/exp.hpp" -#include "kernels/elementwise_functions/expm1.hpp" -#include "kernels/elementwise_functions/floor.hpp" -#include "kernels/elementwise_functions/floor_divide.hpp" -#include "kernels/elementwise_functions/greater.hpp" -#include "kernels/elementwise_functions/greater_equal.hpp" -#include "kernels/elementwise_functions/hypot.hpp" -#include "kernels/elementwise_functions/imag.hpp" -#include "kernels/elementwise_functions/isfinite.hpp" -#include "kernels/elementwise_functions/isinf.hpp" -#include "kernels/elementwise_functions/isnan.hpp" -#include "kernels/elementwise_functions/less.hpp" -#include "kernels/elementwise_functions/less_equal.hpp" -#include "kernels/elementwise_functions/log.hpp" -#include "kernels/elementwise_functions/log10.hpp" -#include "kernels/elementwise_functions/log1p.hpp" -#include "kernels/elementwise_functions/log2.hpp" -#include "kernels/elementwise_functions/logaddexp.hpp" -#include "kernels/elementwise_functions/logical_and.hpp" -#include "kernels/elementwise_functions/logical_not.hpp" -#include "kernels/elementwise_functions/logical_or.hpp" -#include "kernels/elementwise_functions/logical_xor.hpp" -#include "kernels/elementwise_functions/maximum.hpp" -#include "kernels/elementwise_functions/minimum.hpp" -#include "kernels/elementwise_functions/multiply.hpp" -#include "kernels/elementwise_functions/negative.hpp" -#include "kernels/elementwise_functions/not_equal.hpp" -#include "kernels/elementwise_functions/positive.hpp" -#include "kernels/elementwise_functions/pow.hpp" -#include "kernels/elementwise_functions/proj.hpp" -#include "kernels/elementwise_functions/real.hpp" -#include "kernels/elementwise_functions/remainder.hpp" -#include "kernels/elementwise_functions/round.hpp" -#include "kernels/elementwise_functions/sign.hpp" -#include "kernels/elementwise_functions/signbit.hpp" -#include "kernels/elementwise_functions/sin.hpp" -#include "kernels/elementwise_functions/sinh.hpp" -#include 
"kernels/elementwise_functions/sqrt.hpp" -#include "kernels/elementwise_functions/square.hpp" -#include "kernels/elementwise_functions/subtract.hpp" -#include "kernels/elementwise_functions/tan.hpp" -#include "kernels/elementwise_functions/tanh.hpp" -#include "kernels/elementwise_functions/true_divide.hpp" -#include "kernels/elementwise_functions/trunc.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -namespace td_ns = dpctl::tensor::type_dispatch; - -py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t) -{ - switch (dst_typenum_t) { - case td_ns::typenum_t::BOOL: - return py::dtype("?"); - case td_ns::typenum_t::INT8: - return py::dtype("i1"); - case td_ns::typenum_t::UINT8: - return py::dtype("u1"); - case td_ns::typenum_t::INT16: - return py::dtype("i2"); - case td_ns::typenum_t::UINT16: - return py::dtype("u2"); - case td_ns::typenum_t::INT32: - return py::dtype("i4"); - case td_ns::typenum_t::UINT32: - return py::dtype("u4"); - case td_ns::typenum_t::INT64: - return py::dtype("i8"); - case td_ns::typenum_t::UINT64: - return py::dtype("u8"); - case td_ns::typenum_t::HALF: - return py::dtype("f2"); - case td_ns::typenum_t::FLOAT: - return py::dtype("f4"); - case td_ns::typenum_t::DOUBLE: - return py::dtype("f8"); - case td_ns::typenum_t::CFLOAT: - return py::dtype("c8"); - case td_ns::typenum_t::CDOUBLE: - return py::dtype("c16"); - default: - throw py::value_error("Unrecognized dst_typeid"); - } -} - -int _result_typeid(int arg_typeid, const int *fn_output_id) -{ - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) { - throw py::value_error("Input typeid " + std::to_string(arg_typeid) + - " is outside of expected bounds."); - } - - return fn_output_id[arg_typeid]; -} - -namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; -using ew_cmn_ns::binary_contig_impl_fn_ptr_t; -using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_strided_impl_fn_ptr_t; -using ew_cmn_ns::unary_contig_impl_fn_ptr_t; -using ew_cmn_ns::unary_strided_impl_fn_ptr_t; - -using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; -using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; - -// U01: ==== ABS (x) -namespace impl -{ - -namespace abs_fn_ns = dpctl::tensor::kernels::abs; - -static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types]; -static int abs_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - abs_strided_dispatch_vector[td_ns::num_types]; - -void populate_abs_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = abs_fn_ns; - - using fn_ns::AbsContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(abs_contig_dispatch_vector); - - using fn_ns::AbsStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(abs_strided_dispatch_vector); - - using fn_ns::AbsTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(abs_output_typeid_vector); -}; - -} // namespace impl - -// U02: ==== ACOS (x) -namespace impl -{ - -namespace acos_fn_ns = dpctl::tensor::kernels::acos; - -static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types]; -static int acos_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - acos_strided_dispatch_vector[td_ns::num_types]; - -void populate_acos_dispatch_vectors(void) -{ - using namespace 
td_ns; - namespace fn_ns = acos_fn_ns; - - using fn_ns::AcosContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(acos_contig_dispatch_vector); - - using fn_ns::AcosStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(acos_strided_dispatch_vector); - - using fn_ns::AcosTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(acos_output_typeid_vector); -} - -} // namespace impl - -// U03: ===== ACOSH (x) -namespace impl -{ - -namespace acosh_fn_ns = dpctl::tensor::kernels::acosh; - -static unary_contig_impl_fn_ptr_t - acosh_contig_dispatch_vector[td_ns::num_types]; -static int acosh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - acosh_strided_dispatch_vector[td_ns::num_types]; - -void populate_acosh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = acosh_fn_ns; - - using fn_ns::AcoshContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector); - - using fn_ns::AcoshStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector); - - using fn_ns::AcoshTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(acosh_output_typeid_vector); -} - -} // namespace impl - -// B01: ===== ADD (x1, x2) -namespace impl -{ -namespace add_fn_ns = dpctl::tensor::kernels::add; - -static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int add_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - add_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// add(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -// add(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -static binary_inplace_contig_impl_fn_ptr_t - add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_strided_impl_fn_ptr_t - add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t - add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_add_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = add_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::AddTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(add_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::AddStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(add_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::AddContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(add_contig_dispatch_table); - - // function pointers for operation on contiguous matrix, contiguous row - // with contiguous matrix output - using fn_ns::AddContigMatrixContigRowBroadcastFactory; - DispatchTableBuilder< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, - AddContigMatrixContigRowBroadcastFactory, num_types> - dtb4; - dtb4.populate_dispatch_table( - add_contig_matrix_contig_row_broadcast_dispatch_table); - - // function pointers for 
operation on contiguous row, contiguous matrix - // with contiguous matrix output - using fn_ns::AddContigRowContigMatrixBroadcastFactory; - DispatchTableBuilder< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, - AddContigRowContigMatrixBroadcastFactory, num_types> - dtb5; - dtb5.populate_dispatch_table( - add_contig_row_contig_matrix_broadcast_dispatch_table); - - // function pointers for inplace operation on general strided arrays - using fn_ns::AddInplaceStridedFactory; - DispatchTableBuilder - dtb6; - dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table); - - // function pointers for inplace operation on contiguous inputs and output - using fn_ns::AddInplaceContigFactory; - DispatchTableBuilder - dtb7; - dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table); - - // function pointers for inplace operation on contiguous matrix - // and contiguous row - using fn_ns::AddInplaceRowMatrixBroadcastFactory; - DispatchTableBuilder - dtb8; - dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table); -}; - -} // namespace impl - -// U04: ===== ASIN (x) -namespace impl -{ - -namespace asin_fn_ns = dpctl::tensor::kernels::asin; - -static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types]; -static int asin_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - asin_strided_dispatch_vector[td_ns::num_types]; - -void populate_asin_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = asin_fn_ns; - - using fn_ns::AsinContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(asin_contig_dispatch_vector); - - using fn_ns::AsinStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(asin_strided_dispatch_vector); - - using fn_ns::AsinTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(asin_output_typeid_vector); -} - -} // namespace impl - -// U05: ===== ASINH (x) -namespace impl -{ - -namespace asinh_fn_ns = dpctl::tensor::kernels::asinh; - -static unary_contig_impl_fn_ptr_t - asinh_contig_dispatch_vector[td_ns::num_types]; -static int asinh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - asinh_strided_dispatch_vector[td_ns::num_types]; - -void populate_asinh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = asinh_fn_ns; - - using fn_ns::AsinhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector); - - using fn_ns::AsinhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector); - - using fn_ns::AsinhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(asinh_output_typeid_vector); -} - -} // namespace impl - -// U06: ===== ATAN (x) -namespace impl -{ - -namespace atan_fn_ns = dpctl::tensor::kernels::atan; - -static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types]; -static int atan_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - atan_strided_dispatch_vector[td_ns::num_types]; - -void populate_atan_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = atan_fn_ns; - - using fn_ns::AtanContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(atan_contig_dispatch_vector); - - using fn_ns::AtanStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(atan_strided_dispatch_vector); - - using fn_ns::AtanTypeMapFactory; - DispatchVectorBuilder dvb3; - 
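All of the populate_* helpers deleted in this file follow one pattern: a per-operation Factory template whose get() returns either a typed kernel instantiation or nullptr, which DispatchVectorBuilder/DispatchTableBuilder evaluates for every supported type (or type pair) to fill a function-pointer table indexed by type id. A condensed sketch of that pattern, with hypothetical names (neg_impl, NegFactory) and a three-type universe in place of td_ns::num_types:

    #include <cstddef>
    #include <type_traits>

    using unary_fn_t = void (*)(const char *, char *, std::size_t);

    template <typename T>
    void neg_impl(const char *src_p, char *dst_p, std::size_t n)
    {
        const T *src = reinterpret_cast<const T *>(src_p);
        T *dst = reinterpret_cast<T *>(dst_p);
        for (std::size_t i = 0; i < n; ++i) {
            dst[i] = -src[i];
        }
    }

    template <typename fnT, typename T> struct NegFactory
    {
        fnT get() const
        {
            if constexpr (std::is_signed_v<T>) {
                return neg_impl<T>; // supported: return the instantiation
            }
            else {
                return nullptr; // unsupported type: slot stays empty
            }
        }
    };

    // what populate_dispatch_vector produces: one slot per type id
    static unary_fn_t neg_dispatch_vector[] = {
        NegFactory<unary_fn_t, float>{}.get(),
        NegFactory<unary_fn_t, double>{}.get(),
        NegFactory<unary_fn_t, unsigned>{}.get(), // -> nullptr
    };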
dvb3.populate_dispatch_vector(atan_output_typeid_vector); -} - -} // namespace impl - -// B02: ===== ATAN2 (x1, x2) -namespace impl -{ -namespace atan2_fn_ns = dpctl::tensor::kernels::atan2; - -static binary_contig_impl_fn_ptr_t - atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int atan2_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_atan2_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = atan2_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::Atan2TypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(atan2_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::Atan2StridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(atan2_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::Atan2ContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(atan2_contig_dispatch_table); -}; - -} // namespace impl - -// U07: ===== ATANH (x) -namespace impl -{ - -namespace atanh_fn_ns = dpctl::tensor::kernels::atanh; - -static unary_contig_impl_fn_ptr_t - atanh_contig_dispatch_vector[td_ns::num_types]; -static int atanh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - atanh_strided_dispatch_vector[td_ns::num_types]; - -void populate_atanh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = atanh_fn_ns; - - using fn_ns::AtanhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector); - - using fn_ns::AtanhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector); - - using fn_ns::AtanhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(atanh_output_typeid_vector); -} - -} // namespace impl - -// B03: ===== BITWISE_AND (x1, x2) -namespace impl -{ -namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and; - -static binary_contig_impl_fn_ptr_t - bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_and_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_and_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseAndTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_and_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseAndStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseAndContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table); -}; - -} // namespace impl - -// B04: ===== BITWISE_LEFT_SHIFT (x1, x2) -namespace impl -{ -namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift; - -static binary_contig_impl_fn_ptr_t - bitwise_left_shift_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int 
bitwise_left_shift_output_id_table[td_ns::num_types] - [td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_left_shift_strided_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_bitwise_left_shift_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_left_shift_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseLeftShiftTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseLeftShiftStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseLeftShiftContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table); -}; - -} // namespace impl - -// U08: ===== BITWISE_INVERT (x) -namespace impl -{ - -namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert; - -static unary_contig_impl_fn_ptr_t - bitwise_invert_contig_dispatch_vector[td_ns::num_types]; -static int bitwise_invert_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - bitwise_invert_strided_dispatch_vector[td_ns::num_types]; - -void populate_bitwise_invert_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_invert_fn_ns; - - using fn_ns::BitwiseInvertContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector); - - using fn_ns::BitwiseInvertStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector); - - using fn_ns::BitwiseInvertTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector); -}; - -} // namespace impl - -// B05: ===== BITWISE_OR (x1, x2) -namespace impl -{ -namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or; - -static binary_contig_impl_fn_ptr_t - bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_or_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_or_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseOrTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_or_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseOrStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseOrContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table); -}; -} // namespace impl - -// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) -namespace impl -{ -namespace bitwise_right_shift_fn_ns = - dpctl::tensor::kernels::bitwise_right_shift; - -static binary_contig_impl_fn_ptr_t - bitwise_right_shift_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int bitwise_right_shift_output_id_table[td_ns::num_types] - [td_ns::num_types]; - -static 
binary_strided_impl_fn_ptr_t - bitwise_right_shift_strided_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_bitwise_right_shift_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_right_shift_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseRightShiftTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseRightShiftStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseRightShiftContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table); -}; - -} // namespace impl - -// B07: ===== BITWISE_XOR (x1, x2) -namespace impl -{ -namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor; - -static binary_contig_impl_fn_ptr_t - bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_xor_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_xor_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseXorTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_xor_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseXorStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseXorContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table); -}; -} // namespace impl - -// U09: ==== CEIL (x) -namespace impl -{ - -namespace ceil_fn_ns = dpctl::tensor::kernels::ceil; - -static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types]; -static int ceil_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - ceil_strided_dispatch_vector[td_ns::num_types]; - -void populate_ceil_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = ceil_fn_ns; - - using fn_ns::CeilContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector); - - using fn_ns::CeilStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector); - - using fn_ns::CeilTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(ceil_output_typeid_vector); -} - -} // namespace impl - -// U10: ==== CONJ (x) -namespace impl -{ - -namespace conj_fn_ns = dpctl::tensor::kernels::conj; - -static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types]; -static int conj_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - conj_strided_dispatch_vector[td_ns::num_types]; - -void populate_conj_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = conj_fn_ns; - - using fn_ns::ConjContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(conj_contig_dispatch_vector); - - using 
fn_ns::ConjStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(conj_strided_dispatch_vector); - - using fn_ns::ConjTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(conj_output_typeid_vector); -} -} // namespace impl - -// U11: ==== COS (x) -namespace impl -{ - -namespace cos_fn_ns = dpctl::tensor::kernels::cos; - -static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types]; -static int cos_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - cos_strided_dispatch_vector[td_ns::num_types]; - -void populate_cos_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = cos_fn_ns; - - using fn_ns::CosContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(cos_contig_dispatch_vector); - - using fn_ns::CosStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(cos_strided_dispatch_vector); - - using fn_ns::CosTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(cos_output_typeid_vector); -} - -} // namespace impl - -// U12: ==== COSH (x) -namespace impl -{ - -namespace cosh_fn_ns = dpctl::tensor::kernels::cosh; - -static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types]; -static int cosh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - cosh_strided_dispatch_vector[td_ns::num_types]; - -void populate_cosh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = cosh_fn_ns; - - using fn_ns::CoshContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector); - - using fn_ns::CoshStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector); - - using fn_ns::CoshTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(cosh_output_typeid_vector); -} - -} // namespace impl - -// B08: ==== DIVIDE (x1, x2) -namespace impl -{ -namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide; - -static binary_contig_impl_fn_ptr_t - true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// divide(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - true_divide_contig_matrix_contig_row_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -// divide(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - true_divide_contig_row_contig_matrix_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -void populate_true_divide_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = true_divide_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::TrueDivideTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(true_divide_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::TrueDivideStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(true_divide_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::TrueDivideContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(true_divide_contig_dispatch_table); - - // function pointers for operation on contiguous 
-
-// B08: ==== DIVIDE (x1, x2)
-namespace impl
-{
-namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
-
-static binary_contig_impl_fn_ptr_t
-    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// divide(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// divide(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-void populate_true_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = true_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::TrueDivideTypeMapFactory;
-    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(true_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::TrueDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         TrueDivideStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::TrueDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         TrueDivideContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-};
-
-} // namespace impl
-
-// B09: ==== EQUAL (x1, x2)
-namespace impl
-{
-namespace equal_fn_ns = dpctl::tensor::kernels::equal;
-
-static binary_contig_impl_fn_ptr_t
-    equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::EqualTypeMapFactory;
-    DispatchTableBuilder<int, EqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::EqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, EqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::EqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, EqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(equal_contig_dispatch_table);
-};
-} // namespace impl
-
-// U13: ==== EXP (x)
-namespace impl
-{
-
-namespace exp_fn_ns = dpctl::tensor::kernels::exp;
-
-static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types];
-static int exp_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    exp_strided_dispatch_vector[td_ns::num_types];
-
-void populate_exp_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = exp_fn_ns;
-
-    using fn_ns::ExpContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ExpContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(exp_contig_dispatch_vector);
-
-    using fn_ns::ExpStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ExpStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(exp_strided_dispatch_vector);
-
-    using fn_ns::ExpTypeMapFactory;
-    DispatchVectorBuilder<int, ExpTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(exp_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U14: ==== EXPM1 (x)
-namespace impl
-{
-
-namespace expm1_fn_ns = dpctl::tensor::kernels::expm1;
-
-static unary_contig_impl_fn_ptr_t
-    expm1_contig_dispatch_vector[td_ns::num_types];
-static int expm1_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    expm1_strided_dispatch_vector[td_ns::num_types];
-
-void populate_expm1_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = expm1_fn_ns;
-
-    using fn_ns::Expm1ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Expm1ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector);
-
-    using fn_ns::Expm1StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Expm1StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector);
-
-    using fn_ns::Expm1TypeMapFactory;
-    DispatchVectorBuilder<int, Expm1TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(expm1_output_typeid_vector);
-}
-
-} // namespace impl
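The recurring comment "which input types are supported, and what is the type of the result" describes the *_output_id_table objects: a num_types x num_types grid mapping a pair of input type ids to a result type id. A minimal sketch of how such a table could be consulted at call time, assuming (hypothetically) that negative entries mark unsupported pairs; the ids and table values below are made up for illustration:

    #include <iostream>
    #include <stdexcept>

    constexpr int num_types = 3; // hypothetical ids: 0=int32, 1=float, 2=double

    // Look up the result type for (src1, src2); reject unsupported pairs.
    int result_type_id(const int (&output_id_table)[num_types][num_types],
                       int src1_type_id, int src2_type_id)
    {
        const int out_id = output_id_table[src1_type_id][src2_type_id];
        if (out_id < 0) {
            throw std::invalid_argument("unsupported input type combination");
        }
        return out_id;
    }

    int main()
    {
        // e.g. int32 op double -> double; a -1 entry would mean "unsupported"
        int table[num_types][num_types] = {{0, 1, 2}, {1, 1, 2}, {2, 2, 2}};
        std::cout << result_type_id(table, 0, 2) << "\n"; // prints 2 (double)
    }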
-
-// U15: ==== FLOOR (x)
-namespace impl
-{
-
-namespace floor_fn_ns = dpctl::tensor::kernels::floor;
-
-static unary_contig_impl_fn_ptr_t
-    floor_contig_dispatch_vector[td_ns::num_types];
-static int floor_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    floor_strided_dispatch_vector[td_ns::num_types];
-
-void populate_floor_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_fn_ns;
-
-    using fn_ns::FloorContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
-
-    using fn_ns::FloorStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
-
-    using fn_ns::FloorTypeMapFactory;
-    DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(floor_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B10: ==== FLOOR_DIVIDE (x1, x2)
-namespace impl
-{
-namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
-
-static binary_contig_impl_fn_ptr_t
-    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_floor_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::FloorDivideTypeMapFactory;
-    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(floor_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::FloorDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         FloorDivideStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::FloorDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         FloorDivideContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
-};
-
-} // namespace impl
-
-// B11: ==== GREATER (x1, x2)
-namespace impl
-{
-namespace greater_fn_ns = dpctl::tensor::kernels::greater;
-
-static binary_contig_impl_fn_ptr_t
-    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_greater_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = greater_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::GreaterTypeMapFactory;
-    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(greater_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::GreaterStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::GreaterContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
-};
-} // namespace impl
-
-// B12: ==== GREATER_EQUAL (x1, x2)
-namespace impl
-{
-namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
-
-static binary_contig_impl_fn_ptr_t
-    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int
greater_equal_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_greater_equal_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = greater_equal_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::GreaterEqualTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(greater_equal_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::GreaterEqualStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::GreaterEqualContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table); -}; -} // namespace impl - -// U16: ==== IMAG (x) -namespace impl -{ - -namespace imag_fn_ns = dpctl::tensor::kernels::imag; - -static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types]; -static int imag_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - imag_strided_dispatch_vector[td_ns::num_types]; - -void populate_imag_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = imag_fn_ns; - - using fn_ns::ImagContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(imag_contig_dispatch_vector); - - using fn_ns::ImagStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(imag_strided_dispatch_vector); - - using fn_ns::ImagTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(imag_output_typeid_vector); -} -} // namespace impl - -// U17: ==== ISFINITE (x) -namespace impl -{ -namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite; - -static unary_contig_impl_fn_ptr_t - isfinite_contig_dispatch_vector[td_ns::num_types]; -static int isfinite_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - isfinite_strided_dispatch_vector[td_ns::num_types]; - -void populate_isfinite_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = isfinite_fn_ns; - - using fn_ns::IsFiniteContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector); - - using fn_ns::IsFiniteStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector); - - using fn_ns::IsFiniteTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(isfinite_output_typeid_vector); -} - -} // namespace impl - -// U18: ==== ISINF (x) -namespace impl -{ -namespace isinf_fn_ns = dpctl::tensor::kernels::isinf; - -static unary_contig_impl_fn_ptr_t - isinf_contig_dispatch_vector[td_ns::num_types]; -static int isinf_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - isinf_strided_dispatch_vector[td_ns::num_types]; - -void populate_isinf_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = isinf_fn_ns; - - using fn_ns::IsInfContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector); - - using fn_ns::IsInfStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector); - - using fn_ns::IsInfTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(isinf_output_typeid_vector); -} - -} // 
namespace impl - -// U19: ==== ISNAN (x) -namespace impl -{ -namespace isnan_fn_ns = dpctl::tensor::kernels::isnan; - -static unary_contig_impl_fn_ptr_t - isnan_contig_dispatch_vector[td_ns::num_types]; -static int isnan_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - isnan_strided_dispatch_vector[td_ns::num_types]; - -void populate_isnan_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = isnan_fn_ns; - - using fn_ns::IsNanContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector); - - using fn_ns::IsNanStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector); - - using fn_ns::IsNanTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(isnan_output_typeid_vector); -} - -} // namespace impl - -// B13: ==== LESS (x1, x2) -namespace impl -{ -namespace less_fn_ns = dpctl::tensor::kernels::less; - -static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int less_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - less_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_less_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = less_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LessTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(less_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LessStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(less_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LessContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(less_contig_dispatch_table); -}; -} // namespace impl - -// B14: ==== LESS_EQUAL (x1, x2) -namespace impl -{ -namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal; - -static binary_contig_impl_fn_ptr_t - less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_less_equal_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = less_equal_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LessEqualTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(less_equal_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LessEqualStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(less_equal_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LessEqualContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(less_equal_contig_dispatch_table); -}; -} // namespace impl - -// U20: ==== LOG (x) -namespace impl -{ - -namespace log_fn_ns = dpctl::tensor::kernels::log; - -static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types]; -static int log_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log_strided_dispatch_vector[td_ns::num_types]; - -void populate_log_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns 
= log_fn_ns; - - using fn_ns::LogContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log_contig_dispatch_vector); - - using fn_ns::LogStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log_strided_dispatch_vector); - - using fn_ns::LogTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log_output_typeid_vector); -} - -} // namespace impl - -// U21: ==== LOG1P (x) -namespace impl -{ - -namespace log1p_fn_ns = dpctl::tensor::kernels::log1p; - -static unary_contig_impl_fn_ptr_t - log1p_contig_dispatch_vector[td_ns::num_types]; -static int log1p_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log1p_strided_dispatch_vector[td_ns::num_types]; - -void populate_log1p_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = log1p_fn_ns; - - using fn_ns::Log1pContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector); - - using fn_ns::Log1pStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector); - - using fn_ns::Log1pTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log1p_output_typeid_vector); -} - -} // namespace impl - -// U22: ==== LOG2 (x) -namespace impl -{ - -namespace log2_fn_ns = dpctl::tensor::kernels::log2; - -static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types]; -static int log2_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log2_strided_dispatch_vector[td_ns::num_types]; - -void populate_log2_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = log2_fn_ns; - - using fn_ns::Log2ContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log2_contig_dispatch_vector); - - using fn_ns::Log2StridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log2_strided_dispatch_vector); - - using fn_ns::Log2TypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log2_output_typeid_vector); -}; - -} // namespace impl - -// U23: ==== LOG10 (x) -namespace impl -{ - -namespace log10_fn_ns = dpctl::tensor::kernels::log10; - -static unary_contig_impl_fn_ptr_t - log10_contig_dispatch_vector[td_ns::num_types]; -static int log10_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - log10_strided_dispatch_vector[td_ns::num_types]; - -void populate_log10_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = log10_fn_ns; - - using fn_ns::Log10ContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(log10_contig_dispatch_vector); - - using fn_ns::Log10StridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(log10_strided_dispatch_vector); - - using fn_ns::Log10TypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(log10_output_typeid_vector); -}; - -} // namespace impl - -// B15: ==== LOGADDEXP (x1, x2) -namespace impl -{ -namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp; - -static binary_contig_impl_fn_ptr_t - logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logaddexp_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logaddexp_fn_ns; - - // which input types are 
supported, and what is the type of the result - using fn_ns::LogAddExpTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logaddexp_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LogAddExpStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogAddExpContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table); -}; -} // namespace impl - -// B16: ==== LOGICAL_AND (x1, x2) -namespace impl -{ -namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and; - -static binary_contig_impl_fn_ptr_t - logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logical_and_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logical_and_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LogicalAndTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logical_and_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LogicalAndStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logical_and_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogicalAndContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logical_and_contig_dispatch_table); -}; -} // namespace impl - -// U24: ==== LOGICAL_NOT (x) -namespace impl -{ -namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not; - -static unary_contig_impl_fn_ptr_t - logical_not_contig_dispatch_vector[td_ns::num_types]; -static int logical_not_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - logical_not_strided_dispatch_vector[td_ns::num_types]; - -void populate_logical_not_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = logical_not_fn_ns; - - using fn_ns::LogicalNotContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector); - - using fn_ns::LogicalNotStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector); - - using fn_ns::LogicalNotTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(logical_not_output_typeid_vector); -}; -} // namespace impl - -// B17: ==== LOGICAL_OR (x1, x2) -namespace impl -{ -namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or; - -static binary_contig_impl_fn_ptr_t - logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logical_or_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logical_or_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LogicalOrTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logical_or_output_id_table); - - // function pointers for operation on general strided arrays - using 
fn_ns::LogicalOrStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logical_or_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogicalOrContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logical_or_contig_dispatch_table); -}; -} // namespace impl - -// B18: ==== LOGICAL_XOR (x1, x2) -namespace impl -{ -namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor; - -static binary_contig_impl_fn_ptr_t - logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_logical_xor_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = logical_xor_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::LogicalXorTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(logical_xor_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::LogicalXorStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::LogicalXorContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table); -}; -} // namespace impl - -// B??: ==== MAXIMUM (x1, x2) -namespace impl -{ - -namespace maximum_fn_ns = dpctl::tensor::kernels::maximum; - -static binary_contig_impl_fn_ptr_t - maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int maximum_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_maximum_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = maximum_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::MaximumTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(maximum_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::MaximumStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(maximum_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::MaximumContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(maximum_contig_dispatch_table); -}; - -} // namespace impl - -// B??: ==== MINIMUM (x1, x2) -namespace impl -{ - -namespace minimum_fn_ns = dpctl::tensor::kernels::minimum; - -static binary_contig_impl_fn_ptr_t - minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int minimum_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_minimum_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = minimum_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::MinimumTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(minimum_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::MinimumStridedFactory; - DispatchTableBuilder - dtb2; - 
dtb2.populate_dispatch_table(minimum_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::MinimumContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(minimum_contig_dispatch_table); -}; - -} // namespace impl - -// B19: ==== MULTIPLY (x1, x2) -namespace impl -{ - -namespace multiply_fn_ns = dpctl::tensor::kernels::multiply; - -static binary_contig_impl_fn_ptr_t - multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int multiply_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// mul(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - multiply_contig_matrix_contig_row_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -// mul(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - multiply_contig_row_contig_matrix_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -static binary_inplace_contig_impl_fn_ptr_t - multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_strided_impl_fn_ptr_t - multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t - multiply_inplace_row_matrix_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_multiply_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = multiply_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::MultiplyTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(multiply_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::MultiplyStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(multiply_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::MultiplyContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(multiply_contig_dispatch_table); - - // function pointers for operation on contiguous matrix, contiguous row - // with contiguous matrix output - using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory; - DispatchTableBuilder< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, - MultiplyContigMatrixContigRowBroadcastFactory, num_types> - dtb4; - dtb4.populate_dispatch_table( - multiply_contig_matrix_contig_row_broadcast_dispatch_table); - - // function pointers for operation on contiguous row, contiguous matrix - // with contiguous matrix output - using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory; - DispatchTableBuilder< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, - MultiplyContigRowContigMatrixBroadcastFactory, num_types> - dtb5; - dtb5.populate_dispatch_table( - multiply_contig_row_contig_matrix_broadcast_dispatch_table); - - // function pointers for inplace operation on general strided arrays - using fn_ns::MultiplyInplaceStridedFactory; - DispatchTableBuilder - dtb6; - dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table); - - // function pointers for inplace operation on contiguous inputs and output - using fn_ns::MultiplyInplaceContigFactory; - DispatchTableBuilder - dtb7; - dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table); - - // function pointers for inplace operation on contiguous matrix - // and 
contiguous row - using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory; - DispatchTableBuilder - dtb8; - dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table); -}; - -} // namespace impl - -// U25: ==== NEGATIVE (x) -namespace impl -{ - -namespace negative_fn_ns = dpctl::tensor::kernels::negative; - -static unary_contig_impl_fn_ptr_t - negative_contig_dispatch_vector[td_ns::num_types]; -static int negative_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - negative_strided_dispatch_vector[td_ns::num_types]; - -void populate_negative_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = negative_fn_ns; - - using fn_ns::NegativeContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(negative_contig_dispatch_vector); - - using fn_ns::NegativeStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(negative_strided_dispatch_vector); - - using fn_ns::NegativeTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(negative_output_typeid_vector); -} - -} // namespace impl - -// B20: ==== NOT_EQUAL (x1, x2) -namespace impl -{ -namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal; - -static binary_contig_impl_fn_ptr_t - not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_not_equal_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = not_equal_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::NotEqualTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(not_equal_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::NotEqualStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(not_equal_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::NotEqualContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(not_equal_contig_dispatch_table); -}; -} // namespace impl - -// U26: ==== POSITIVE (x) -namespace impl -{ - -namespace positive_fn_ns = dpctl::tensor::kernels::positive; - -static unary_contig_impl_fn_ptr_t - positive_contig_dispatch_vector[td_ns::num_types]; -static int positive_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - positive_strided_dispatch_vector[td_ns::num_types]; - -void populate_positive_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = positive_fn_ns; - - using fn_ns::PositiveContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(positive_contig_dispatch_vector); - - using fn_ns::PositiveStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(positive_strided_dispatch_vector); - - using fn_ns::PositiveTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(positive_output_typeid_vector); -} - -} // namespace impl - -// B21: ==== POW (x1, x2) -namespace impl -{ - -namespace pow_fn_ns = dpctl::tensor::kernels::pow; - -static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int pow_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - 
pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_pow_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = pow_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::PowTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(pow_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::PowStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(pow_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::PowContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(pow_contig_dispatch_table); -}; - -} // namespace impl - -// U??: ==== PROJ (x) -namespace impl -{ - -namespace proj_fn_ns = dpctl::tensor::kernels::proj; - -static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types]; -static int proj_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - proj_strided_dispatch_vector[td_ns::num_types]; - -void populate_proj_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = proj_fn_ns; - - using fn_ns::ProjContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(proj_contig_dispatch_vector); - - using fn_ns::ProjStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(proj_strided_dispatch_vector); - - using fn_ns::ProjTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(proj_output_typeid_vector); -} -} // namespace impl - -// U27: ==== REAL (x) -namespace impl -{ - -namespace real_fn_ns = dpctl::tensor::kernels::real; - -static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types]; -static int real_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - real_strided_dispatch_vector[td_ns::num_types]; - -void populate_real_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = real_fn_ns; - - using fn_ns::RealContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(real_contig_dispatch_vector); - - using fn_ns::RealStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(real_strided_dispatch_vector); - - using fn_ns::RealTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(real_output_typeid_vector); -} -} // namespace impl - -// B22: ==== REMAINDER (x1, x2) -namespace impl -{ - -namespace remainder_fn_ns = dpctl::tensor::kernels::remainder; - -static binary_contig_impl_fn_ptr_t - remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int remainder_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_remainder_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = remainder_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::RemainderTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(remainder_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::RemainderStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(remainder_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::RemainderContigFactory; - DispatchTableBuilder - dtb3; - 
dtb3.populate_dispatch_table(remainder_contig_dispatch_table); -} - -} // namespace impl - -// U28: ==== ROUND (x) -namespace impl -{ - -namespace round_fn_ns = dpctl::tensor::kernels::round; - -static unary_contig_impl_fn_ptr_t - round_contig_dispatch_vector[td_ns::num_types]; -static int round_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - round_strided_dispatch_vector[td_ns::num_types]; - -void populate_round_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = round_fn_ns; - - using fn_ns::RoundContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(round_contig_dispatch_vector); - - using fn_ns::RoundStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(round_strided_dispatch_vector); - - using fn_ns::RoundTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(round_output_typeid_vector); -} - -} // namespace impl - -// U29: ==== SIGN (x) -namespace impl -{ - -namespace sign_fn_ns = dpctl::tensor::kernels::sign; - -static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types]; -static int sign_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sign_strided_dispatch_vector[td_ns::num_types]; - -void populate_sign_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sign_fn_ns; - - using fn_ns::SignContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sign_contig_dispatch_vector); - - using fn_ns::SignStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sign_strided_dispatch_vector); - - using fn_ns::SignTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sign_output_typeid_vector); -} - -} // namespace impl - -// ==== SIGNBIT (x) -namespace impl -{ - -namespace signbit_fn_ns = dpctl::tensor::kernels::signbit; - -static unary_contig_impl_fn_ptr_t - signbit_contig_dispatch_vector[td_ns::num_types]; -static int signbit_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - signbit_strided_dispatch_vector[td_ns::num_types]; - -void populate_signbit_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = signbit_fn_ns; - - using fn_ns::SignbitContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector); - - using fn_ns::SignbitStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector); - - using fn_ns::SignbitTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(signbit_output_typeid_vector); -} - -} // namespace impl - -// U30: ==== SIN (x) -namespace impl -{ - -namespace sin_fn_ns = dpctl::tensor::kernels::sin; - -static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types]; -static int sin_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sin_strided_dispatch_vector[td_ns::num_types]; - -void populate_sin_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sin_fn_ns; - - using fn_ns::SinContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sin_contig_dispatch_vector); - - using fn_ns::SinStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sin_strided_dispatch_vector); - - using fn_ns::SinTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sin_output_typeid_vector); -} - -} // namespace impl - -// U31: ==== SINH (x) -namespace 
impl -{ - -namespace sinh_fn_ns = dpctl::tensor::kernels::sinh; - -static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types]; -static int sinh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sinh_strided_dispatch_vector[td_ns::num_types]; - -void populate_sinh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sinh_fn_ns; - - using fn_ns::SinhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector); - - using fn_ns::SinhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector); - - using fn_ns::SinhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sinh_output_typeid_vector); -} - -} // namespace impl - -// U32: ==== SQUARE (x) -namespace impl -{ - -namespace square_fn_ns = dpctl::tensor::kernels::square; - -static unary_contig_impl_fn_ptr_t - square_contig_dispatch_vector[td_ns::num_types]; -static int square_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - square_strided_dispatch_vector[td_ns::num_types]; - -void populate_square_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = square_fn_ns; - - using fn_ns::SquareContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(square_contig_dispatch_vector); - - using fn_ns::SquareStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(square_strided_dispatch_vector); - - using fn_ns::SquareTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(square_output_typeid_vector); -} - -} // namespace impl - -// U33: ==== SQRT (x) -namespace impl -{ - -namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt; - -static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types]; -static int sqrt_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - sqrt_strided_dispatch_vector[td_ns::num_types]; - -void populate_sqrt_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = sqrt_fn_ns; - - using fn_ns::SqrtContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector); - - using fn_ns::SqrtStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector); - - using fn_ns::SqrtTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(sqrt_output_typeid_vector); -} - -} // namespace impl - -// B23: ==== SUBTRACT (x1, x2) -namespace impl -{ -namespace subtract_fn_ns = dpctl::tensor::kernels::subtract; - -static binary_contig_impl_fn_ptr_t - subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int subtract_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// sub(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - subtract_contig_matrix_contig_row_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -// sub(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - subtract_contig_row_contig_matrix_broadcast_dispatch_table - [td_ns::num_types][td_ns::num_types]; - -static binary_inplace_contig_impl_fn_ptr_t - subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_strided_impl_fn_ptr_t - 
subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t - subtract_inplace_row_matrix_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_subtract_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = subtract_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::SubtractTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(subtract_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::SubtractStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(subtract_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::SubtractContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(subtract_contig_dispatch_table); - - // function pointers for operation on contiguous matrix, contiguous row - // with contiguous matrix output - using fn_ns::SubtractContigMatrixContigRowBroadcastFactory; - DispatchTableBuilder< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, - SubtractContigMatrixContigRowBroadcastFactory, num_types> - dtb4; - dtb4.populate_dispatch_table( - subtract_contig_matrix_contig_row_broadcast_dispatch_table); - - // function pointers for operation on contiguous row, contiguous matrix - // with contiguous matrix output - using fn_ns::SubtractContigRowContigMatrixBroadcastFactory; - DispatchTableBuilder< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, - SubtractContigRowContigMatrixBroadcastFactory, num_types> - dtb5; - dtb5.populate_dispatch_table( - subtract_contig_row_contig_matrix_broadcast_dispatch_table); - - // function pointers for inplace operation on general strided arrays - using fn_ns::SubtractInplaceStridedFactory; - DispatchTableBuilder - dtb6; - dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table); - - // function pointers for inplace operation on contiguous inputs and output - using fn_ns::SubtractInplaceContigFactory; - DispatchTableBuilder - dtb7; - dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table); - - // function pointers for inplace operation on contiguous matrix - // and contiguous row - using fn_ns::SubtractInplaceRowMatrixBroadcastFactory; - DispatchTableBuilder - dtb8; - dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table); -}; - -} // namespace impl - -// U34: ==== TAN (x) -namespace impl -{ - -namespace tan_fn_ns = dpctl::tensor::kernels::tan; - -static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types]; -static int tan_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - tan_strided_dispatch_vector[td_ns::num_types]; - -void populate_tan_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = tan_fn_ns; - - using fn_ns::TanContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(tan_contig_dispatch_vector); - - using fn_ns::TanStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(tan_strided_dispatch_vector); - - using fn_ns::TanTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(tan_output_typeid_vector); -} - -} // namespace impl - -// U35: ==== TANH (x) -namespace impl -{ - -namespace tanh_fn_ns = dpctl::tensor::kernels::tanh; - -static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types]; -static int 
tanh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    tanh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_tanh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = tanh_fn_ns;
-
-    using fn_ns::TanhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
-
-    using fn_ns::TanhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
-
-    using fn_ns::TanhTypeMapFactory;
-    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U36: ==== TRUNC (x)
-namespace impl
-{
-
-namespace trunc_fn_ns = dpctl::tensor::kernels::trunc;
-
-static unary_contig_impl_fn_ptr_t
-    trunc_contig_dispatch_vector[td_ns::num_types];
-static int trunc_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    trunc_strided_dispatch_vector[td_ns::num_types];
-
-void populate_trunc_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = trunc_fn_ns;
-
-    using fn_ns::TruncContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TruncContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector);
-
-    using fn_ns::TruncStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TruncStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector);
-
-    using fn_ns::TruncTypeMapFactory;
-    DispatchVectorBuilder<int, TruncTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(trunc_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B24: ==== HYPOT (x1, x2)
-
-namespace impl
-{
-namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
-
-static binary_contig_impl_fn_ptr_t
-    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_hypot_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = hypot_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::HypotTypeMapFactory;
-    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(hypot_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::HypotStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::HypotContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
-};
-
-} // namespace impl
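Each deleted operator section above registers up to three kernel families: a type map, a general strided implementation, and a contiguous fast path (plus broadcast and in-place variants for a few binary operators). At call time the Python binding layer chooses between the strided and contiguous pointers. A simplified sketch of that choice, under assumed signatures (this is not py_binary_ufunc itself):

    #include <cstddef>
    #include <iostream>

    using contig_fn_t = void (*)(std::size_t n);
    using strided_fn_t = void (*)(std::size_t n, std::ptrdiff_t stride);

    static void contig_impl(std::size_t) { std::cout << "contig fast path\n"; }
    static void strided_impl(std::size_t, std::ptrdiff_t)
    {
        std::cout << "general strided path\n";
    }

    // Prefer the specialized contiguous kernel when one was registered and
    // the data layout allows it; otherwise fall back to the strided kernel.
    void dispatch(bool all_c_contig, contig_fn_t contig_fn,
                  strided_fn_t strided_fn, std::size_t n, std::ptrdiff_t stride)
    {
        if (all_c_contig && contig_fn != nullptr) {
            contig_fn(n);
        }
        else {
            strided_fn(n, stride);
        }
    }

    int main()
    {
        dispatch(true, &contig_impl, &strided_impl, 8, 1);  // contig fast path
        dispatch(false, &contig_impl, &strided_impl, 8, 2); // strided path
    }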
-
-// ==========================================================================================
-//
-
-namespace py = pybind11;
-
-void init_elementwise_functions(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    // U01: ==== ABS (x)
-    {
-        impl::populate_abs_dispatch_vectors();
-        using impl::abs_contig_dispatch_vector;
-        using impl::abs_output_typeid_vector;
-        using impl::abs_strided_dispatch_vector;
-
-        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, abs_output_typeid_vector,
-                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
-        };
-        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
-        };
-        m.def("_abs_result_type", abs_result_type_pyapi);
-    }
-
-    // U02: ==== ACOS (x)
-    {
-        impl::populate_acos_dispatch_vectors();
-        using impl::acos_contig_dispatch_vector;
-        using impl::acos_output_typeid_vector;
-        using impl::acos_strided_dispatch_vector;
-
-        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acos_output_typeid_vector,
-                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
-        };
-        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
-        };
-        m.def("_acos_result_type", acos_result_type_pyapi);
-    }
-
-    // U03: ===== ACOSH (x)
-    {
-        impl::populate_acosh_dispatch_vectors();
-        using impl::acosh_contig_dispatch_vector;
-        using impl::acosh_output_typeid_vector;
-        using impl::acosh_strided_dispatch_vector;
-
-        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acosh_output_typeid_vector,
-                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
-        };
-        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              acosh_output_typeid_vector);
-        };
-        m.def("_acosh_result_type", acosh_result_type_pyapi);
-    }
-
-    // B01: ===== ADD (x1, x2)
-    {
-        impl::populate_add_dispatch_tables();
-        using impl::add_contig_dispatch_table;
-        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::add_output_id_table;
-        using impl::add_strided_dispatch_table;
-
-        auto add_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                             const dpctl::tensor::usm_ndarray &src2,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, add_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                add_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                add_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                add_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig row and
-                // c-contig matrix with broadcasting (may be nullptr)
-                add_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
-                                         const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               add_output_id_table);
-        };
-        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_add_result_type", add_result_type_pyapi, "");
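The in-place tables registered next (add_inplace_*) exist because _add_inplace writes through its left-hand operand instead of allocating a result, so it needs kernels with a different signature than the out-of-place ones and never consults an output-allocation step. A minimal host-side illustration of what such an in-place contiguous kernel does (hypothetical, CPU-only; the real kernels are SYCL kernels submitted to a queue):

    #include <cstddef>
    #include <iostream>

    // In-place contiguous add: lhs is both an input and the destination,
    // so no temporary result array is needed.
    template <typename T>
    void add_inplace_contig(T *lhs, const T *rhs, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i) {
            lhs[i] += rhs[i];
        }
    }

    int main()
    {
        double lhs[3] = {1.0, 2.0, 3.0};
        const double rhs[3] = {0.5, 0.5, 0.5};
        add_inplace_contig(lhs, rhs, 3);
        std::cout << lhs[0] << " " << lhs[1] << " " << lhs[2] << "\n"; // 1.5 2.5 3.5
    }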
-
-        using impl::add_inplace_contig_dispatch_table;
-        using impl::add_inplace_row_matrix_dispatch_table;
-        using impl::add_inplace_strided_dispatch_table;
-
-        auto add_inplace_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends, add_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    add_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    add_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    add_inplace_row_matrix_dispatch_table);
-            };
-        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-
-    // U04: ===== ASIN (x)
-    {
-        impl::populate_asin_dispatch_vectors();
-        using impl::asin_contig_dispatch_vector;
-        using impl::asin_output_typeid_vector;
-        using impl::asin_strided_dispatch_vector;
-
-        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asin_output_typeid_vector,
-                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
-        };
-        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
-        };
-        m.def("_asin_result_type", asin_result_type_pyapi);
-    }
-
-    // U05: ===== ASINH (x)
-    {
-        impl::populate_asinh_dispatch_vectors();
-        using impl::asinh_contig_dispatch_vector;
-        using impl::asinh_output_typeid_vector;
-        using impl::asinh_strided_dispatch_vector;
-
-        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asinh_output_typeid_vector,
-                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
-        };
-        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              asinh_output_typeid_vector);
-        };
-        m.def("_asinh_result_type", asinh_result_type_pyapi);
-    }
-
-    // U06: ===== ATAN (x)
-    {
-        impl::populate_atan_dispatch_vectors();
-        using impl::atan_contig_dispatch_vector;
-        using impl::atan_output_typeid_vector;
-        using impl::atan_strided_dispatch_vector;
-
-        auto atan_pyapi = [&](arrayT src, arrayT dst, sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, atan_output_typeid_vector,
-                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
-        };
-        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto atan_result_type_pyapi = [&](py::dtype dtype) {
-            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
-        };
-        m.def("_atan_result_type", atan_result_type_pyapi);
-    }
-
-    // B02: ===== ATAN2 (x1, x2)
-    {
-        impl::populate_atan2_dispatch_tables();
-        using impl::atan2_contig_dispatch_table;
-        using impl::atan2_output_id_table;
-        using impl::atan2_strided_dispatch_table;
-
-        auto atan2_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                               const dpctl::tensor::usm_ndarray &src2,
-                               const dpctl::tensor::usm_ndarray &dst,
-                               sycl::queue &exec_q,
-                               const std::vector<sycl::event> &depends =
{}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, atan2_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - atan2_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - atan2_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto atan2_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - atan2_output_id_table); - }; - m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"), - py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_atan2_result_type", atan2_result_type_pyapi, ""); - } - - // U07: ===== ATANH (x) - { - impl::populate_atanh_dispatch_vectors(); - using impl::atanh_contig_dispatch_vector; - using impl::atanh_output_typeid_vector; - using impl::atanh_strided_dispatch_vector; - - auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, atanh_output_typeid_vector, - atanh_contig_dispatch_vector, atanh_strided_dispatch_vector); - }; - m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto atanh_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - atanh_output_typeid_vector); - }; - m.def("_atanh_result_type", atanh_result_type_pyapi); - } - - // B03: ===== BITWISE_AND (x1, x2) - { - impl::populate_bitwise_and_dispatch_tables(); - using impl::bitwise_and_contig_dispatch_table; - using impl::bitwise_and_output_id_table; - using impl::bitwise_and_strided_dispatch_table; - - auto bitwise_and_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, bitwise_and_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - bitwise_and_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - bitwise_and_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - bitwise_and_output_id_table); - }; - m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_bitwise_and_result_type", 
-    // B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
-    {
-        impl::populate_bitwise_left_shift_dispatch_tables();
-        using impl::bitwise_left_shift_contig_dispatch_table;
-        using impl::bitwise_left_shift_output_id_table;
-        using impl::bitwise_left_shift_strided_dispatch_table;
-
-        auto bitwise_left_shift_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_left_shift_output_id_table,
-                bitwise_left_shift_contig_dispatch_table,
-                bitwise_left_shift_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_left_shift_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_left_shift_output_id_table);
-        };
-        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_left_shift_result_type", bitwise_left_shift_result_type_pyapi, "");
-    }
-
-    // U08: ===== BITWISE_INVERT (x)
-    {
-        impl::populate_bitwise_invert_dispatch_vectors();
-        using impl::bitwise_invert_contig_dispatch_vector;
-        using impl::bitwise_invert_output_typeid_vector;
-        using impl::bitwise_invert_strided_dispatch_vector;
-
-        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, bitwise_invert_output_typeid_vector, bitwise_invert_contig_dispatch_vector, bitwise_invert_strided_dispatch_vector);
-        };
-        m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, bitwise_invert_output_typeid_vector);
-        };
-        m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi);
-    }
-
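The bitwise bindings above pass td_ns::NullPtrTable{} where an operation has no specialized matrix-row kernel. A standalone sketch of what such a table plausibly is (an assumption about the type_dispatch internals, not dpctl's actual definition; 14 stands in for the number of supported typeids):

    #include <cstddef>

    // Sketch: a 2D table whose every entry is nullptr, signalling
    // "no specialized kernel here; use the strided fallback".
    template <typename FnT, std::size_t N = 14>
    struct null_ptr_table_sketch {
        FnT entries[N][N] = {};  // value-initialization nulls every entry
        FnT operator()(int i, int j) const { return entries[i][j]; }
    };

    int main() {
        null_ptr_table_sketch<void (*)()> t;
        return t(2, 3) == nullptr ? 0 : 1;  // always takes the fallback
    }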
-    // B05: ===== BITWISE_OR (x1, x2)
-    {
-        impl::populate_bitwise_or_dispatch_tables();
-        using impl::bitwise_or_contig_dispatch_table;
-        using impl::bitwise_or_output_id_table;
-        using impl::bitwise_or_strided_dispatch_table;
-
-        auto bitwise_or_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_or_output_id_table,
-                bitwise_or_contig_dispatch_table,
-                bitwise_or_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_or_output_id_table);
-        };
-        m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, "");
-    }
-
-    // B06: ===== BITWISE_RIGHT_SHIFT (x1, x2)
-    {
-        impl::populate_bitwise_right_shift_dispatch_tables();
-        using impl::bitwise_right_shift_contig_dispatch_table;
-        using impl::bitwise_right_shift_output_id_table;
-        using impl::bitwise_right_shift_strided_dispatch_table;
-
-        auto bitwise_right_shift_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_right_shift_output_id_table,
-                bitwise_right_shift_contig_dispatch_table,
-                bitwise_right_shift_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_right_shift_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_right_shift_output_id_table);
-        };
-        m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_right_shift_result_type", bitwise_right_shift_result_type_pyapi, "");
-    }
-
-    // B07: ===== BITWISE_XOR (x1, x2)
-    {
-        impl::populate_bitwise_xor_dispatch_tables();
-        using impl::bitwise_xor_contig_dispatch_table;
-        using impl::bitwise_xor_output_id_table;
-        using impl::bitwise_xor_strided_dispatch_table;
-
-        auto bitwise_xor_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table,
-                bitwise_xor_contig_dispatch_table,
-                bitwise_xor_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, bitwise_xor_output_id_table);
-        };
-        m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, "");
-    }
-
-    // U09: ==== CEIL (x)
-    {
-        impl::populate_ceil_dispatch_vectors();
-        using impl::ceil_contig_dispatch_vector;
-        using impl::ceil_output_typeid_vector;
-        using impl::ceil_strided_dispatch_vector;
-
-        auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, ceil_output_typeid_vector, ceil_contig_dispatch_vector, ceil_strided_dispatch_vector);
-        };
-        m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto ceil_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector);
-        };
-        m.def("_ceil_result_type", ceil_result_type_pyapi);
-    }
-
-    // U10: ==== CONJ (x)
-    {
-        impl::populate_conj_dispatch_vectors();
-        using impl::conj_contig_dispatch_vector;
-        using impl::conj_output_typeid_vector;
-        using impl::conj_strided_dispatch_vector;
-
-        auto conj_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, conj_output_typeid_vector, conj_contig_dispatch_vector, conj_strided_dispatch_vector);
-        };
-        m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto conj_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector);
-        };
-        m.def("_conj_result_type", conj_result_type_pyapi);
-    }
-
-    // U11: ==== COS (x)
-    {
-        impl::populate_cos_dispatch_vectors();
-        using impl::cos_contig_dispatch_vector;
-        using impl::cos_output_typeid_vector;
-        using impl::cos_strided_dispatch_vector;
-
-        auto cos_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, cos_output_typeid_vector, cos_contig_dispatch_vector, cos_strided_dispatch_vector);
-        };
-        m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cos_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector);
-        };
-        m.def("_cos_result_type", cos_result_type_pyapi);
-    }
-
-    // U12: ==== COSH (x)
-    {
-        impl::populate_cosh_dispatch_vectors();
-        using impl::cosh_contig_dispatch_vector;
-        using impl::cosh_output_typeid_vector;
-        using impl::cosh_strided_dispatch_vector;
-
-        auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, cosh_output_typeid_vector, cosh_contig_dispatch_vector, cosh_strided_dispatch_vector);
-        };
-        m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cosh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector);
-        };
-        m.def("_cosh_result_type", cosh_result_type_pyapi);
-    }
-
-    // B08: ==== DIVIDE (x1, x2)
-    {
-        impl::populate_true_divide_dispatch_tables();
-        using impl::true_divide_contig_dispatch_table;
-        using impl::true_divide_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::true_divide_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::true_divide_output_id_table;
-        using impl::true_divide_strided_dispatch_table;
-
-        auto divide_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, true_divide_output_id_table,
-                true_divide_contig_dispatch_table,
-                true_divide_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and c-contig row with broadcasting (may be nullptr)
-                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
-                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto divide_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, true_divide_output_id_table);
-        };
-        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_divide_result_type", divide_result_type_pyapi, "");
-    }
-
-    // B09: ==== EQUAL (x1, x2)
-    {
-        impl::populate_equal_dispatch_tables();
-        using impl::equal_contig_dispatch_table;
-        using impl::equal_output_id_table;
-        using impl::equal_strided_dispatch_table;
-
-        auto equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, equal_output_id_table,
-                equal_contig_dispatch_table,
-                equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, equal_output_id_table);
-        };
-        m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_equal_result_type", equal_result_type_pyapi, "");
-    }
-
-    // U13: ==== EXP (x)
-    {
-        impl::populate_exp_dispatch_vectors();
-        using impl::exp_contig_dispatch_vector;
-        using impl::exp_output_typeid_vector;
-        using impl::exp_strided_dispatch_vector;
-
-        auto exp_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, exp_output_typeid_vector, exp_contig_dispatch_vector, exp_strided_dispatch_vector);
-        };
-        m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto exp_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector);
-        };
-        m.def("_exp_result_type", exp_result_type_pyapi);
-    }
-
-    // U14: ==== EXPM1 (x)
-    {
-        impl::populate_expm1_dispatch_vectors();
-        using impl::expm1_contig_dispatch_vector;
-        using impl::expm1_output_typeid_vector;
-        using impl::expm1_strided_dispatch_vector;
-
-        auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, expm1_output_typeid_vector, expm1_contig_dispatch_vector, expm1_strided_dispatch_vector);
-        };
-        m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto expm1_result_type_pyapi = [&](const py::dtype dtype) {
-            return py_unary_ufunc_result_type(dtype, expm1_output_typeid_vector);
-        };
-        m.def("_expm1_result_type", expm1_result_type_pyapi);
-    }
-
-    // U15: ==== FLOOR (x)
-    {
-        impl::populate_floor_dispatch_vectors();
-        using impl::floor_contig_dispatch_vector;
-        using impl::floor_output_typeid_vector;
-        using impl::floor_strided_dispatch_vector;
-
-        auto floor_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, floor_output_typeid_vector, floor_contig_dispatch_vector, floor_strided_dispatch_vector);
-        };
-        m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, floor_output_typeid_vector);
-        };
-        m.def("_floor_result_type", floor_result_type_pyapi);
-    }
-
-    // B10: ==== FLOOR_DIVIDE (x1, x2)
-    {
-        impl::populate_floor_divide_dispatch_tables();
-        using impl::floor_divide_contig_dispatch_table;
-        using impl::floor_divide_output_id_table;
-        using impl::floor_divide_strided_dispatch_table;
-
-        auto floor_divide_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
-                floor_divide_contig_dispatch_table,
-                floor_divide_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, floor_divide_output_id_table);
-        };
-        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
-    }
-
-    // B11: ==== GREATER (x1, x2)
-    {
-        impl::populate_greater_dispatch_tables();
-        using impl::greater_contig_dispatch_table;
-        using impl::greater_output_id_table;
-        using impl::greater_strided_dispatch_table;
-
-        auto greater_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, greater_output_id_table,
-                greater_contig_dispatch_table,
-                greater_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto greater_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, greater_output_id_table);
-        };
-        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_greater_result_type", greater_result_type_pyapi, "");
-    }
-
-    // B12: ==== GREATER_EQUAL (x1, x2)
-    {
-        impl::populate_greater_equal_dispatch_tables();
-        using impl::greater_equal_contig_dispatch_table;
-        using impl::greater_equal_output_id_table;
-        using impl::greater_equal_strided_dispatch_table;
-
-        auto greater_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
-                greater_equal_contig_dispatch_table,
-                greater_equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, greater_equal_output_id_table);
-        };
-        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi, "");
-    }
-
-    // U16: ==== IMAG (x)
-    {
-        impl::populate_imag_dispatch_vectors();
-        using impl::imag_contig_dispatch_vector;
-        using impl::imag_output_typeid_vector;
-        using impl::imag_strided_dispatch_vector;
-
-        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, imag_output_typeid_vector, imag_contig_dispatch_vector, imag_strided_dispatch_vector);
-        };
-        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
-        };
-        m.def("_imag_result_type", imag_result_type_pyapi);
-    }
-
-    // U17: ==== ISFINITE (x)
-    {
-        impl::populate_isfinite_dispatch_vectors();
-        using impl::isfinite_contig_dispatch_vector;
-        using impl::isfinite_output_typeid_vector;
-        using impl::isfinite_strided_dispatch_vector;
-
-        auto isfinite_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, isfinite_output_typeid_vector, isfinite_contig_dispatch_vector, isfinite_strided_dispatch_vector);
-        };
-        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, isfinite_output_typeid_vector);
-        };
-        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_isfinite_result_type", isfinite_result_type_pyapi, "");
-    }
-
-    // U18: ==== ISINF (x)
-    {
-        impl::populate_isinf_dispatch_vectors();
-        using impl::isinf_contig_dispatch_vector;
-        using impl::isinf_output_typeid_vector;
-        using impl::isinf_strided_dispatch_vector;
-
-        auto isinf_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, isinf_output_typeid_vector, isinf_contig_dispatch_vector, isinf_strided_dispatch_vector);
-        };
-        auto isinf_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, isinf_output_typeid_vector);
-        };
-        m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_isinf_result_type", isinf_result_type_pyapi, "");
-    }
-
-    // U19: ==== ISNAN (x)
-    {
-        impl::populate_isnan_dispatch_vectors();
-        using impl::isnan_contig_dispatch_vector;
-        using impl::isnan_output_typeid_vector;
-        using impl::isnan_strided_dispatch_vector;
-
-        auto isnan_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, isnan_output_typeid_vector, isnan_contig_dispatch_vector, isnan_strided_dispatch_vector);
-        };
-        auto isnan_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, isnan_output_typeid_vector);
-        };
-        m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_isnan_result_type", isnan_result_type_pyapi, "");
-    }
-
-    // B13: ==== LESS (x1, x2)
-    {
-        impl::populate_less_dispatch_tables();
-        using impl::less_contig_dispatch_table;
-        using impl::less_output_id_table;
-        using impl::less_strided_dispatch_table;
-
-        auto less_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, less_output_id_table,
-                less_contig_dispatch_table,
-                less_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto less_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, less_output_id_table);
-        };
-        m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_less_result_type", less_result_type_pyapi, "");
-    }
-
-    // B14: ==== LESS_EQUAL (x1, x2)
-    {
-        impl::populate_less_equal_dispatch_tables();
-        using impl::less_equal_contig_dispatch_table;
-        using impl::less_equal_output_id_table;
-        using impl::less_equal_strided_dispatch_table;
-
-        auto less_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, less_equal_output_id_table,
-                less_equal_contig_dispatch_table,
-                less_equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, less_equal_output_id_table);
-        };
-        m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_less_equal_result_type", less_equal_result_type_pyapi, "");
-    }
-
-    // U20: ==== LOG (x)
-    {
-        impl::populate_log_dispatch_vectors();
-        using impl::log_contig_dispatch_vector;
-        using impl::log_output_typeid_vector;
-        using impl::log_strided_dispatch_vector;
-
-        auto log_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log_output_typeid_vector, log_contig_dispatch_vector, log_strided_dispatch_vector);
-        };
-        m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log_output_typeid_vector);
-        };
-        m.def("_log_result_type", log_result_type_pyapi);
-    }
-
-    // U21: ==== LOG1P (x)
-    {
-        impl::populate_log1p_dispatch_vectors();
-        using impl::log1p_contig_dispatch_vector;
-        using impl::log1p_output_typeid_vector;
-        using impl::log1p_strided_dispatch_vector;
-
-        auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log1p_output_typeid_vector, log1p_contig_dispatch_vector, log1p_strided_dispatch_vector);
-        };
-        m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log1p_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log1p_output_typeid_vector);
-        };
-        m.def("_log1p_result_type", log1p_result_type_pyapi);
-    }
-
-    // U22: ==== LOG2 (x)
-    {
-        impl::populate_log2_dispatch_vectors();
-        using impl::log2_contig_dispatch_vector;
-        using impl::log2_output_typeid_vector;
-        using impl::log2_strided_dispatch_vector;
-
-        auto log2_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log2_output_typeid_vector, log2_contig_dispatch_vector, log2_strided_dispatch_vector);
-        };
-        auto log2_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector);
-        };
-        m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_log2_result_type", log2_result_type_pyapi, "");
-    }
-
-    // U23: ==== LOG10 (x)
-    {
-        impl::populate_log10_dispatch_vectors();
-        using impl::log10_contig_dispatch_vector;
-        using impl::log10_output_typeid_vector;
-        using impl::log10_strided_dispatch_vector;
-
-        auto log10_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, log10_output_typeid_vector, log10_contig_dispatch_vector, log10_strided_dispatch_vector);
-        };
-        auto log10_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log10_output_typeid_vector);
-        };
-        m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_log10_result_type", log10_result_type_pyapi, "");
-    }
-
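Every section starts with an impl::populate_*_dispatch_vectors() call, which fills static per-typeid function-pointer storage once at module import. A standalone sketch of that populate step (the names, the 14-entry size, and the slot assignment are illustrative assumptions):

    #include <array>

    using unary_fn = int (*)(int);
    static int square_int(int x) { return x * x; }

    // one slot per source typeid; nullptr means "no kernel for this typeid"
    static std::array<unary_fn, 14> square_contig_dispatch_vector_sketch{};

    void populate_square_dispatch_vector_sketch()
    {
        square_contig_dispatch_vector_sketch.fill(nullptr);
        square_contig_dispatch_vector_sketch[3] = &square_int;  // e.g. int32 slot
    }

    int main() {
        populate_square_dispatch_vector_sketch();
        return square_contig_dispatch_vector_sketch[3](5) == 25 ? 0 : 1;
    }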
-    // B15: ==== LOGADDEXP (x1, x2)
-    {
-        impl::populate_logaddexp_dispatch_tables();
-        using impl::logaddexp_contig_dispatch_table;
-        using impl::logaddexp_output_id_table;
-        using impl::logaddexp_strided_dispatch_table;
-
-        auto logaddexp_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logaddexp_output_id_table,
-                logaddexp_contig_dispatch_table,
-                logaddexp_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logaddexp_output_id_table);
-        };
-        m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, "");
-    }
-
-    // B16: ==== LOGICAL_AND (x1, x2)
-    {
-        impl::populate_logical_and_dispatch_tables();
-        using impl::logical_and_contig_dispatch_table;
-        using impl::logical_and_output_id_table;
-        using impl::logical_and_strided_dispatch_table;
-
-        auto logical_and_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
-                logical_and_contig_dispatch_table,
-                logical_and_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logical_and_output_id_table);
-        };
-        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
-    }
-
-    // U24: ==== LOGICAL_NOT (x)
-    {
-        impl::populate_logical_not_dispatch_vectors();
-        using impl::logical_not_contig_dispatch_vector;
-        using impl::logical_not_output_typeid_vector;
-        using impl::logical_not_strided_dispatch_vector;
-
-        auto logical_not_pyapi = [&](const arrayT &src, arrayT dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, logical_not_output_typeid_vector, logical_not_contig_dispatch_vector, logical_not_strided_dispatch_vector);
-        };
-        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, logical_not_output_typeid_vector);
-        };
-        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
-    }
-
-    // B17: ==== LOGICAL_OR (x1, x2)
-    {
-        impl::populate_logical_or_dispatch_tables();
-        using impl::logical_or_contig_dispatch_table;
-        using impl::logical_or_output_id_table;
-        using impl::logical_or_strided_dispatch_table;
-
-        auto logical_or_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
-                logical_or_contig_dispatch_table,
-                logical_or_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logical_or_output_id_table);
-        };
-        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
-    }
-
-    // B18: ==== LOGICAL_XOR (x1, x2)
-    {
-        impl::populate_logical_xor_dispatch_tables();
-        using impl::logical_xor_contig_dispatch_table;
-        using impl::logical_xor_output_id_table;
-        using impl::logical_xor_strided_dispatch_table;
-
-        auto logical_xor_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
-                logical_xor_contig_dispatch_table,
-                logical_xor_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, logical_xor_output_id_table);
-        };
-        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
-    }
-
-    // B??: ==== MAXIMUM (x1, x2)
-    {
-        impl::populate_maximum_dispatch_tables();
-        using impl::maximum_contig_dispatch_table;
-        using impl::maximum_output_id_table;
-        using impl::maximum_strided_dispatch_table;
-
-        auto maximum_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, maximum_output_id_table,
-                maximum_contig_dispatch_table,
-                maximum_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, maximum_output_id_table);
-        };
-        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
-    }
-
-    // B??: ==== MINIMUM (x1, x2)
-    {
-        impl::populate_minimum_dispatch_tables();
-        using impl::minimum_contig_dispatch_table;
-        using impl::minimum_output_id_table;
-        using impl::minimum_strided_dispatch_table;
-
-        auto minimum_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, minimum_output_id_table,
-                minimum_contig_dispatch_table,
-                minimum_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, minimum_output_id_table);
-        };
-        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
-    }
-
-    // B19: ==== MULTIPLY (x1, x2)
-    {
-        impl::populate_multiply_dispatch_tables();
-        using impl::multiply_contig_dispatch_table;
-        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::multiply_output_id_table;
-        using impl::multiply_strided_dispatch_table;
-
-        auto multiply_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, multiply_output_id_table,
-                multiply_contig_dispatch_table,
-                multiply_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and c-contig row with broadcasting (may be nullptr)
-                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
-                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, multiply_output_id_table);
-        };
-        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
-
-        using impl::multiply_inplace_contig_dispatch_table;
-        using impl::multiply_inplace_row_matrix_dispatch_table;
-        using impl::multiply_inplace_strided_dispatch_table;
-
-        auto multiply_inplace_pyapi = [&](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, multiply_output_id_table,
-                multiply_inplace_contig_dispatch_table,
-                multiply_inplace_strided_dispatch_table,
-                multiply_inplace_row_matrix_dispatch_table);
-        };
-        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-
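The in-place variants registered for add, multiply, and subtract take only lhs and rhs: the left operand doubles as the output buffer, so no separate dst argument is exposed to Python. A minimal standalone sketch of that contract:

    #include <cstddef>

    // lhs is both input and output, which is why _multiply_inplace is bound
    // with py::arg("lhs") and py::arg("rhs") rather than src/dst pairs
    void multiply_inplace_sketch(double *lhs, const double *rhs, std::size_t n)
    {
        for (std::size_t i = 0; i != n; ++i)
            lhs[i] *= rhs[i];
    }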
-    // U25: ==== NEGATIVE (x)
-    {
-        impl::populate_negative_dispatch_vectors();
-        using impl::negative_contig_dispatch_vector;
-        using impl::negative_output_typeid_vector;
-        using impl::negative_strided_dispatch_vector;
-
-        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, negative_output_typeid_vector, negative_contig_dispatch_vector, negative_strided_dispatch_vector);
-        };
-        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, negative_output_typeid_vector);
-        };
-        m.def("_negative_result_type", negative_result_type_pyapi);
-    }
-
-    // B20: ==== NOT_EQUAL (x1, x2)
-    {
-        impl::populate_not_equal_dispatch_tables();
-        using impl::not_equal_contig_dispatch_table;
-        using impl::not_equal_output_id_table;
-        using impl::not_equal_strided_dispatch_table;
-
-        auto not_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
-                not_equal_contig_dispatch_table,
-                not_equal_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, not_equal_output_id_table);
-        };
-        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
-    }
-
-    // U26: ==== POSITIVE (x)
-    {
-        impl::populate_positive_dispatch_vectors();
-        using impl::positive_contig_dispatch_vector;
-        using impl::positive_output_typeid_vector;
-        using impl::positive_strided_dispatch_vector;
-
-        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, positive_output_typeid_vector, positive_contig_dispatch_vector, positive_strided_dispatch_vector);
-        };
-        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, positive_output_typeid_vector);
-        };
-        m.def("_positive_result_type", positive_result_type_pyapi);
-    }
-
-    // B21: ==== POW (x1, x2)
-    {
-        impl::populate_pow_dispatch_tables();
-        using impl::pow_contig_dispatch_table;
-        using impl::pow_output_id_table;
-        using impl::pow_strided_dispatch_table;
-
-        auto pow_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, pow_output_id_table,
-                pow_contig_dispatch_table,
-                pow_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto pow_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, pow_output_id_table);
-        };
-        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_pow_result_type", pow_result_type_pyapi, "");
-    }
-
-    // U??: ==== PROJ (x)
-    {
-        impl::populate_proj_dispatch_vectors();
-        using impl::proj_contig_dispatch_vector;
-        using impl::proj_output_typeid_vector;
-        using impl::proj_strided_dispatch_vector;
-
-        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, proj_output_typeid_vector, proj_contig_dispatch_vector, proj_strided_dispatch_vector);
-        };
-        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector);
-        };
-        m.def("_proj_result_type", proj_result_type_pyapi);
-    }
-
-    // U27: ==== REAL (x)
-    {
-        impl::populate_real_dispatch_vectors();
-        using impl::real_contig_dispatch_vector;
-        using impl::real_output_typeid_vector;
-        using impl::real_strided_dispatch_vector;
-
-        auto real_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, real_output_typeid_vector, real_contig_dispatch_vector, real_strided_dispatch_vector);
-        };
-        m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto real_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, real_output_typeid_vector);
-        };
-        m.def("_real_result_type", real_result_type_pyapi);
-    }
-
-    // B22: ==== REMAINDER (x1, x2)
-    {
-        impl::populate_remainder_dispatch_tables();
-        using impl::remainder_contig_dispatch_table;
-        using impl::remainder_output_id_table;
-        using impl::remainder_strided_dispatch_table;
-
-        auto remainder_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, remainder_output_id_table,
-                remainder_contig_dispatch_table,
-                remainder_strided_dispatch_table,
-                td_ns::NullPtrTable<binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                td_ns::NullPtrTable<binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, remainder_output_id_table);
-        };
-        m.def("_remainder", remainder_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
-    }
-
-    // U28: ==== ROUND (x)
-    {
-        impl::populate_round_dispatch_vectors();
-        using impl::round_contig_dispatch_vector;
-        using impl::round_output_typeid_vector;
-        using impl::round_strided_dispatch_vector;
-
-        auto round_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, round_output_typeid_vector, round_contig_dispatch_vector, round_strided_dispatch_vector);
-        };
-        m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto round_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, round_output_typeid_vector);
-        };
-        m.def("_round_result_type", round_result_type_pyapi);
-    }
-
-    // U29: ==== SIGN (x)
-    {
-        impl::populate_sign_dispatch_vectors();
-        using impl::sign_contig_dispatch_vector;
-        using impl::sign_output_typeid_vector;
-        using impl::sign_strided_dispatch_vector;
-
-        auto sign_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sign_output_typeid_vector, sign_contig_dispatch_vector, sign_strided_dispatch_vector);
-        };
-        m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sign_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector);
-        };
-        m.def("_sign_result_type", sign_result_type_pyapi);
-    }
-
-    // ==== SIGNBIT (x)
-    {
-        impl::populate_signbit_dispatch_vectors();
-        using impl::signbit_contig_dispatch_vector;
-        using impl::signbit_output_typeid_vector;
-        using impl::signbit_strided_dispatch_vector;
-
-        auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, signbit_output_typeid_vector, signbit_contig_dispatch_vector, signbit_strided_dispatch_vector);
-        };
-        m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto signbit_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, signbit_output_typeid_vector);
-        };
-        m.def("_signbit_result_type", signbit_result_type_pyapi);
-    }
-
-    // U30: ==== SIN (x)
-    {
-        impl::populate_sin_dispatch_vectors();
-        using impl::sin_contig_dispatch_vector;
-        using impl::sin_output_typeid_vector;
-        using impl::sin_strided_dispatch_vector;
-
-        auto sin_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sin_output_typeid_vector, sin_contig_dispatch_vector, sin_strided_dispatch_vector);
-        };
-        m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector);
-        };
-        m.def("_sin_result_type", sin_result_type_pyapi);
-    }
-
-    // U31: ==== SINH (x)
-    {
-        impl::populate_sinh_dispatch_vectors();
-        using impl::sinh_contig_dispatch_vector;
-        using impl::sinh_output_typeid_vector;
-        using impl::sinh_strided_dispatch_vector;
-
-        auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sinh_output_typeid_vector, sinh_contig_dispatch_vector, sinh_strided_dispatch_vector);
-        };
-        m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector);
-        };
-        m.def("_sinh_result_type", sinh_result_type_pyapi);
-    }
-
-    // U32: ==== SQUARE (x)
-    {
-        impl::populate_square_dispatch_vectors();
-        using impl::square_contig_dispatch_vector;
-        using impl::square_output_typeid_vector;
-        using impl::square_strided_dispatch_vector;
-
-        auto square_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, square_output_typeid_vector, square_contig_dispatch_vector, square_strided_dispatch_vector);
-        };
-        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, square_output_typeid_vector);
-        };
-        m.def("_square_result_type", square_result_type_pyapi);
-    }
-
-    // U33: ==== SQRT (x)
-    {
-        impl::populate_sqrt_dispatch_vectors();
-        using impl::sqrt_contig_dispatch_vector;
-        using impl::sqrt_output_typeid_vector;
-        using impl::sqrt_strided_dispatch_vector;
-
-        auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends, sqrt_output_typeid_vector, sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector);
-        };
-        m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector);
-        };
-        m.def("_sqrt_result_type", sqrt_result_type_pyapi);
-    }
-
-    // B23: ==== SUBTRACT (x1, x2)
-    {
-        impl::populate_subtract_dispatch_tables();
-        using impl::subtract_contig_dispatch_table;
-        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::subtract_output_id_table;
-        using impl::subtract_strided_dispatch_table;
-
-        auto subtract_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, const dpctl::tensor::usm_ndarray &src2, const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, subtract_output_id_table,
-                subtract_contig_dispatch_table,
-                subtract_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and c-contig row with broadcasting (may be nullptr)
-                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
-                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2, subtract_output_id_table);
-        };
-        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
-
subtract_output_id_table, - // function pointers to handle inplace operation on - // contiguous arrays (pointers may be nullptr) - subtract_inplace_contig_dispatch_table, - // function pointers to handle inplace operation on strided - // arrays (most general case) - subtract_inplace_strided_dispatch_table, - // function pointers to handle inplace operation on - // c-contig matrix with c-contig row with broadcasting - // (may be nullptr) - subtract_inplace_row_matrix_dispatch_table); - }; - m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"), - py::arg("rhs"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - } - - // U34: ==== TAN (x) - { - impl::populate_tan_dispatch_vectors(); - using impl::tan_contig_dispatch_vector; - using impl::tan_output_typeid_vector; - using impl::tan_strided_dispatch_vector; - - auto tan_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, tan_output_typeid_vector, - tan_contig_dispatch_vector, tan_strided_dispatch_vector); - }; - m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto tan_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector); - }; - m.def("_tan_result_type", tan_result_type_pyapi); - } - - // U35: ==== TANH (x) - { - impl::populate_tanh_dispatch_vectors(); - using impl::tanh_contig_dispatch_vector; - using impl::tanh_output_typeid_vector; - using impl::tanh_strided_dispatch_vector; - - auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, tanh_output_typeid_vector, - tanh_contig_dispatch_vector, tanh_strided_dispatch_vector); - }; - m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto tanh_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector); - }; - m.def("_tanh_result_type", tanh_result_type_pyapi); - } - - // U36: ==== TRUNC (x) - { - impl::populate_trunc_dispatch_vectors(); - using impl::trunc_contig_dispatch_vector; - using impl::trunc_output_typeid_vector; - using impl::trunc_strided_dispatch_vector; - - auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, trunc_output_typeid_vector, - trunc_contig_dispatch_vector, trunc_strided_dispatch_vector); - }; - m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto trunc_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - trunc_output_typeid_vector); - }; - m.def("_trunc_result_type", trunc_result_type_pyapi); - } - - // B24: ==== HYPOT (x1, x2) - { - impl::populate_hypot_dispatch_tables(); - using impl::hypot_contig_dispatch_table; - using impl::hypot_output_id_table; - using impl::hypot_strided_dispatch_table; - - auto hypot_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, hypot_output_id_table, - // function pointers to 
handle operation on contiguous arrays - // (pointers may be nullptr) - hypot_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - hypot_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto hypot_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - hypot_output_id_table); - }; - m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"), - py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_hypot_result_type", hypot_result_type_pyapi, ""); - } -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp new file mode 100644 index 0000000000..4b3e8b635b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
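
Every per-function source file introduced from here on follows one recipe: three static per-dtype arrays (contiguous kernels, strided kernels, output type ids), a populate_*_dispatch_vectors() routine that fills them via DispatchVectorBuilder, and an init_*() hook that binds the Python entry points. As a minimal sketch of the dispatch-vector idea, assuming a toy two-dtype world and invented names (SketchAbsFactory, abs_kernel) rather than dpctl's actual type_dispatch machinery:

#include <array>
#include <cmath>
#include <cstddef>
#include <iostream>

using unary_fn_t = void (*)(const void *, void *, std::size_t);

template <typename T>
void abs_kernel(const void *src_p, void *dst_p, std::size_t n)
{
    const T *src = static_cast<const T *>(src_p);
    T *dst = static_cast<T *>(dst_p);
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = std::abs(src[i]);
}

// Stands in for AbsContigFactory: yields the kernel for dtype T.
template <typename T> struct SketchAbsFactory
{
    unary_fn_t get() { return abs_kernel<T>; }
};

int main()
{
    // Two dtypes stand in for dpctl's td_ns::num_types entries.
    std::array<unary_fn_t, 2> dispatch_vector;
    dispatch_vector[0] = SketchAbsFactory<float>{}.get();
    dispatch_vector[1] = SketchAbsFactory<double>{}.get();

    double in[3] = {-1.5, 2.0, -3.25};
    double out[3];
    dispatch_vector[1](in, out, 3); // index 1 == double in this sketch
    std::cout << out[0] << ' ' << out[2] << '\n'; // prints "1.5 3.25"
}
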
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "abs.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/abs.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U01: ==== ABS (x)
+namespace impl
+{
+
+namespace abs_fn_ns = dpctl::tensor::kernels::abs;
+
+static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types];
+static int abs_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    abs_strided_dispatch_vector[td_ns::num_types];
+
+void populate_abs_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = abs_fn_ns;
+
+    using fn_ns::AbsContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AbsContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(abs_contig_dispatch_vector);
+
+    using fn_ns::AbsStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AbsStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(abs_strided_dispatch_vector);
+
+    using fn_ns::AbsTypeMapFactory;
+    DispatchVectorBuilder<int, AbsTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(abs_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_abs(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_abs_dispatch_vectors();
+        using impl::abs_contig_dispatch_vector;
+        using impl::abs_output_typeid_vector;
+        using impl::abs_strided_dispatch_vector;
+
+        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, abs_output_typeid_vector,
+                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
+        };
+        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
+        };
+        m.def("_abs_result_type", abs_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
new file mode 100644
index 0000000000..d09eafc6bd
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_abs(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp new file mode 100644 index 0000000000..011cc052fb --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
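
The _*_result_type bindings answer a pure type query, with no kernel launch: given an input dtype, what dtype would the result have, or is the input unsupported. A hedged sketch of that lookup follows; the four-entry table and its promotions are invented for illustration, while dpctl's real vector spans all td_ns::num_types dtypes:

#include <array>
#include <stdexcept>

enum sketch_typeid : int { b1 = 0, i4 = 1, f4 = 2, f8 = 3, num_ids = 4 };

constexpr std::array<int, num_ids> acos_out_typeid = {
    -1, // bool: unsupported for acos in this sketch
    f8, // int32 promotes to float64
    f4, // float32 -> float32
    f8, // float64 -> float64
};

int result_typeid(int src_typeid)
{
    int out = acos_out_typeid.at(src_typeid);
    if (out < 0)
        throw std::invalid_argument("dtype not supported");
    return out;
}
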
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "acos.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/acos.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U02: ==== ACOS (x) +namespace impl +{ + +namespace acos_fn_ns = dpctl::tensor::kernels::acos; + +static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types]; +static int acos_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + acos_strided_dispatch_vector[td_ns::num_types]; + +void populate_acos_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = acos_fn_ns; + + using fn_ns::AcosContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(acos_contig_dispatch_vector); + + using fn_ns::AcosStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(acos_strided_dispatch_vector); + + using fn_ns::AcosTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(acos_output_typeid_vector); +}; + +} // namespace impl + +void init_acos(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_acos_dispatch_vectors(); + using impl::acos_contig_dispatch_vector; + using impl::acos_output_typeid_vector; + using impl::acos_strided_dispatch_vector; + + auto acos_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, acos_output_typeid_vector, + acos_contig_dispatch_vector, acos_strided_dispatch_vector); + }; + m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto acos_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector); + }; + m.def("_acos_result_type", acos_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp new file mode 100644 index 0000000000..3a43d4087c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_acos(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp new file mode 100644 index 0000000000..526bd44f12 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
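
Each populate_* routine fills two kernel vectors because py_unary_ufunc is handed both: a contiguous fast path that may assume densely packed unit-stride data, and a general strided fallback. Roughly, with plain-loop signatures standing in for dpctl's SYCL kernel ABI:

#include <cmath>
#include <cstddef>

// Contiguous kernel: unit stride, friendly to vectorization.
template <typename T>
void acosh_contig(const T *src, T *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = std::acosh(src[i]);
}

// Strided kernel: walks arbitrary element strides (most general case).
template <typename T>
void acosh_strided(const T *src, std::ptrdiff_t src_stride, T *dst,
                   std::ptrdiff_t dst_stride, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i * dst_stride] = std::acosh(src[i * src_stride]);
}
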
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "acosh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/acosh.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U03: ==== ACOSH (x) +namespace impl +{ + +namespace acosh_fn_ns = dpctl::tensor::kernels::acosh; + +static unary_contig_impl_fn_ptr_t + acosh_contig_dispatch_vector[td_ns::num_types]; +static int acosh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + acosh_strided_dispatch_vector[td_ns::num_types]; + +void populate_acosh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = acosh_fn_ns; + + using fn_ns::AcoshContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector); + + using fn_ns::AcoshStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector); + + using fn_ns::AcoshTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(acosh_output_typeid_vector); +}; + +} // namespace impl + +void init_acosh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_acosh_dispatch_vectors(); + using impl::acosh_contig_dispatch_vector; + using impl::acosh_output_typeid_vector; + using impl::acosh_strided_dispatch_vector; + + auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, acosh_output_typeid_vector, + acosh_contig_dispatch_vector, acosh_strided_dispatch_vector); + }; + m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto acosh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + acosh_output_typeid_vector); + }; + m.def("_acosh_result_type", acosh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp new file mode 100644 index 0000000000..dd13ba886c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_acosh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp new file mode 100644 index 0000000000..247b8e0283 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp @@ -0,0 +1,229 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
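
Binary operations such as add swap the one-dimensional vectors for num_types x num_types tables indexed by both input type ids; the same layout recurs for every B-numbered function in this patch. A small runnable sketch of the lookup convention, with kNumTypes and the dtype ids invented:

#include <cstddef>
#include <iostream>

constexpr std::size_t kNumTypes = 3; // stand-in for td_ns::num_types

using binary_fn_t = void (*)(const void *, const void *, void *,
                             std::size_t);

static void add_f8_f8(const void *a, const void *b, void *r, std::size_t n)
{
    const double *x = static_cast<const double *>(a);
    const double *y = static_cast<const double *>(b);
    double *out = static_cast<double *>(r);
    for (std::size_t i = 0; i < n; ++i)
        out[i] = x[i] + y[i];
}

int main()
{
    // Entries left nullptr mean "no specialized kernel for this pair";
    // the caller must promote inputs or report the combination as
    // unsupported.
    binary_fn_t add_table[kNumTypes][kNumTypes] = {};
    add_table[2][2] = add_f8_f8; // pretend id 2 == float64

    double a[2] = {1.0, 2.0}, b[2] = {0.5, 0.25}, r[2];
    if (binary_fn_t fn = add_table[2][2])
        fn(a, b, r, 2);
    std::cout << r[0] << ' ' << r[1] << '\n'; // prints "1.5 2.25"
}
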
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "add.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/add.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B01: ===== ADD (x1, x2) +namespace impl +{ + +namespace add_fn_ns = dpctl::tensor::kernels::add; + +static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int add_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + add_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// add(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +// add(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t + add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_add_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = add_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::AddTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(add_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::AddStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(add_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::AddContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(add_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::AddContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + AddContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + add_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::AddContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + 
AddContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + add_contig_row_contig_matrix_broadcast_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::AddInplaceStridedFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::AddInplaceContigFactory; + DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table); + + // function pointers for inplace operation on contiguous matrix + // and contiguous row + using fn_ns::AddInplaceRowMatrixBroadcastFactory; + DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table); +}; + +} // namespace impl + +void init_add(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_add_dispatch_tables(); + using impl::add_contig_dispatch_table; + using impl::add_contig_matrix_contig_row_broadcast_dispatch_table; + using impl::add_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::add_output_id_table; + using impl::add_strided_dispatch_table; + + auto add_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, add_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + add_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + add_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + add_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + add_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto add_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + add_output_id_table); + }; + m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_add_result_type", add_result_type_pyapi, ""); + + using impl::add_inplace_contig_dispatch_table; + using impl::add_inplace_row_matrix_dispatch_table; + using impl::add_inplace_strided_dispatch_table; + + auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, add_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + add_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + add_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + add_inplace_row_matrix_dispatch_table); + }; + m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git 
a/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp new file mode 100644 index 0000000000..5f88bfaa04 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_add(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp new file mode 100644 index 0000000000..14ef5e2665 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
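
Each new header exports a single init_<op>(py::module_) hook. Presumably a central routine chains them when assembling _tensor_impl; the aggregator below is hypothetical, and only the init_* signatures come from the headers in this patch:

#include <pybind11/pybind11.h>

namespace py = pybind11;

namespace dpctl::tensor::py_internal
{
// Declarations as exported by abs.hpp, acos.hpp, add.hpp, asin.hpp:
void init_abs(py::module_);
void init_acos(py::module_);
void init_add(py::module_);
void init_asin(py::module_);

// Hypothetical aggregator name, shown only to illustrate the flow.
void init_elementwise_sketch(py::module_ m)
{
    init_abs(m);
    init_acos(m);
    init_add(m);
    init_asin(m);
}
} // namespace dpctl::tensor::py_internal
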
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "asin.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/asin.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U04: ==== ASIN (x) +namespace impl +{ + +namespace asin_fn_ns = dpctl::tensor::kernels::asin; + +static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types]; +static int asin_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + asin_strided_dispatch_vector[td_ns::num_types]; + +void populate_asin_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = asin_fn_ns; + + using fn_ns::AsinContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(asin_contig_dispatch_vector); + + using fn_ns::AsinStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(asin_strided_dispatch_vector); + + using fn_ns::AsinTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(asin_output_typeid_vector); +}; + +} // namespace impl + +void init_asin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_asin_dispatch_vectors(); + using impl::asin_contig_dispatch_vector; + using impl::asin_output_typeid_vector; + using impl::asin_strided_dispatch_vector; + + auto asin_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, asin_output_typeid_vector, + asin_contig_dispatch_vector, asin_strided_dispatch_vector); + }; + m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto asin_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector); + }; + m.def("_asin_result_type", asin_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp new file mode 100644 index 0000000000..0beed1d19c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_asin(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp new file mode 100644 index 0000000000..dd0b4e62f7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
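
All init_* bodies delegate the real work to py_unary_ufunc from elementwise_functions.hpp. Its contract is loosely reconstructed below under stated assumptions: ArraySketch is an invented stand-in for usm_ndarray, and the real function additionally validates queue compatibility and memory overlap and returns SYCL events rather than void:

#include <cstddef>
#include <stdexcept>

struct ArraySketch
{
    int type_id;
    bool c_contiguous;
    const void *data;
    void *mutable_data;
    std::size_t nelems;
};

using contig_fn_t = void (*)(const void *, void *, std::size_t);

template <std::size_t N>
void unary_ufunc_sketch(const ArraySketch &src, ArraySketch &dst,
                        const int (&out_typeid)[N],
                        contig_fn_t (&contig_fns)[N])
{
    // 1. Type check: the destination dtype must match the table.
    int expected = out_typeid[src.type_id];
    if (expected == -1 || expected != dst.type_id)
        throw std::invalid_argument("unsupported or mismatched dtypes");
    // 2. Kernel selection: contiguous fast path when available.
    if (src.c_contiguous && dst.c_contiguous && contig_fns[src.type_id]) {
        contig_fns[src.type_id](src.data, dst.mutable_data, src.nelems);
        return;
    }
    // 3. Otherwise the general strided kernel would run (omitted).
    throw std::runtime_error("strided path omitted in this sketch");
}
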
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "asinh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/asinh.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U05: ==== ASINH (x) +namespace impl +{ + +namespace asinh_fn_ns = dpctl::tensor::kernels::asinh; + +static unary_contig_impl_fn_ptr_t + asinh_contig_dispatch_vector[td_ns::num_types]; +static int asinh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + asinh_strided_dispatch_vector[td_ns::num_types]; + +void populate_asinh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = asinh_fn_ns; + + using fn_ns::AsinhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector); + + using fn_ns::AsinhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector); + + using fn_ns::AsinhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(asinh_output_typeid_vector); +}; + +} // namespace impl + +void init_asinh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_asinh_dispatch_vectors(); + using impl::asinh_contig_dispatch_vector; + using impl::asinh_output_typeid_vector; + using impl::asinh_strided_dispatch_vector; + + auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, asinh_output_typeid_vector, + asinh_contig_dispatch_vector, asinh_strided_dispatch_vector); + }; + m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto asinh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + asinh_output_typeid_vector); + }; + m.def("_asinh_result_type", asinh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp new file mode 100644 index 0000000000..22cc37b2d8 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_asinh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp new file mode 100644 index 0000000000..81ff00c46a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
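
One detail recurs in every m.def here: a lambda's C++ default argument (depends = {}) is invisible to pybind11, so the binding must restate it as py::arg("depends") = py::list(). A self-contained illustration of that mechanic without any SYCL dependency; the module and function names are invented:

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

namespace py = pybind11;

int count_deps(int x, const std::vector<int> &depends = {})
{
    return x + static_cast<int>(depends.size());
}

PYBIND11_MODULE(default_arg_sketch, m)
{
    // The C++ default above is not seen by pybind11; the binding must
    // restate it, exactly as these files do with py::list(). The empty
    // list is converted to an empty std::vector at call time.
    m.def("count_deps", &count_deps, py::arg("x"),
          py::arg("depends") = py::list());
}
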
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "atan.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/atan.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U06: ==== ATAN (x) +namespace impl +{ + +namespace atan_fn_ns = dpctl::tensor::kernels::atan; + +static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types]; +static int atan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + atan_strided_dispatch_vector[td_ns::num_types]; + +void populate_atan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = atan_fn_ns; + + using fn_ns::AtanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(atan_contig_dispatch_vector); + + using fn_ns::AtanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(atan_strided_dispatch_vector); + + using fn_ns::AtanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(atan_output_typeid_vector); +}; + +} // namespace impl + +void init_atan(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_atan_dispatch_vectors(); + using impl::atan_contig_dispatch_vector; + using impl::atan_output_typeid_vector; + using impl::atan_strided_dispatch_vector; + + auto atan_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, atan_output_typeid_vector, + atan_contig_dispatch_vector, atan_strided_dispatch_vector); + }; + m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto atan_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector); + }; + m.def("_atan_result_type", atan_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp new file mode 100644 index 0000000000..86df06699c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_atan(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp new file mode 100644 index 0000000000..d12a4ff540 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
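
Unlike add or subtract, atan2 registers no contiguous-matrix/contiguous-row broadcast specializations, so the binding passes td_ns::NullPtrTable instances, that is, tables whose every entry is nullptr. A hedged guess at what such a helper boils down to, with an illustrative type name:

#include <array>
#include <cstddef>

template <typename fnT, std::size_t N> struct NullPtrTableSketch
{
    // Every (i, j) entry is value-initialized to nullptr: callers
    // probing for a fast broadcast path find none and fall back to the
    // general strided kernel.
    std::array<std::array<fnT, N>, N> value{};
};

using broadcast_fn_t = void (*)();
NullPtrTableSketch<broadcast_fn_t, 3> no_fast_path; // all nullptr
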
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "atan2.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/atan2.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B02: ===== ATAN2 (x1, x2) +namespace impl +{ +namespace atan2_fn_ns = dpctl::tensor::kernels::atan2; + +static binary_contig_impl_fn_ptr_t + atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int atan2_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_atan2_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = atan2_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::Atan2TypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(atan2_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::Atan2StridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(atan2_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::Atan2ContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(atan2_contig_dispatch_table); +}; + +} // namespace impl + +void init_atan2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_atan2_dispatch_tables(); + using impl::atan2_contig_dispatch_table; + using impl::atan2_output_id_table; + using impl::atan2_strided_dispatch_table; + + auto atan2_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, atan2_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + atan2_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + atan2_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto atan2_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + atan2_output_id_table); + }; + m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_atan2_result_type", atan2_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace 
tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp new file mode 100644 index 0000000000..f369d12208 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_atan2(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp new file mode 100644 index 0000000000..c42769b8d0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
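
The exec_q and depends parameters threaded through every pyapi lambda implement ordinary SYCL event chaining: a kernel waits on caller-supplied events and returns its own. A generic sketch of that contract, where a fill kernel stands in for the actual atanh implementation and launch_after is an invented name:

#include <sycl/sycl.hpp>
#include <vector>

// dst is assumed to be USM memory allocated on q's context.
sycl::event launch_after(sycl::queue &q, float *dst, std::size_t n,
                         const std::vector<sycl::event> &depends)
{
    return q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends); // honor caller-supplied ordering
        cgh.parallel_for(sycl::range<1>{n},
                         [=](sycl::id<1> i) { dst[i] = 0.5f; });
    });
}
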
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "atanh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/atanh.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U07: ==== ATANH (x) +namespace impl +{ + +namespace atanh_fn_ns = dpctl::tensor::kernels::atanh; + +static unary_contig_impl_fn_ptr_t + atanh_contig_dispatch_vector[td_ns::num_types]; +static int atanh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + atanh_strided_dispatch_vector[td_ns::num_types]; + +void populate_atanh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = atanh_fn_ns; + + using fn_ns::AtanhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector); + + using fn_ns::AtanhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector); + + using fn_ns::AtanhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(atanh_output_typeid_vector); +}; + +} // namespace impl + +void init_atanh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_atanh_dispatch_vectors(); + using impl::atanh_contig_dispatch_vector; + using impl::atanh_output_typeid_vector; + using impl::atanh_strided_dispatch_vector; + + auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, atanh_output_typeid_vector, + atanh_contig_dispatch_vector, atanh_strided_dispatch_vector); + }; + m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto atanh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + atanh_output_typeid_vector); + }; + m.def("_atanh_result_type", atanh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp new file mode 100644 index 0000000000..ba2930d80e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_atanh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp new file mode 100644 index 0000000000..f86f5112cd --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
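The `populate_*_dispatch_vectors` functions above fill one function-pointer slot per supported dtype. As a simplified model of that pattern (illustrative only; dpctl's real `DispatchVectorBuilder` lives in `utils/type_dispatch.hpp`), a factory class template yields either a typed kernel entry point or `nullptr` for each element type:

```cpp
// Simplified dispatch-vector population: one slot per dtype, nullptr when
// a type is unsupported. Names and signatures here are assumptions.
#include <cstddef>
#include <type_traits>

using unary_fn = void (*)(std::size_t n, const char *src, char *dst);

template <typename T> struct ExampleContigFactory
{
    unary_fn get()
    {
        // toy rule: only floating-point input is supported
        if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
            return [](std::size_t, const char *, char *) {
                // a real factory would return a kernel that submits SYCL work
            };
        }
        else {
            return nullptr;
        }
    }
};

template <template <typename> class Factory, typename... Types>
void populate_vector(unary_fn (&vec)[sizeof...(Types)])
{
    std::size_t i = 0;
    ((vec[i++] = Factory<Types>{}.get()), ...); // one entry per dtype
}
```

The type-map builder (`dvb3` above) follows the same scheme but stores an output type id per input type instead of a function pointer.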
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_and.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_and.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B03: ===== BITWISE_AND (x1, x2) +namespace impl +{ +namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and; + +static binary_contig_impl_fn_ptr_t + bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_and_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_and_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_and_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_and_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseAndTypeMapFactory; + DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_and_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseAndStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseAndContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseAndInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseAndInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table(bitwise_and_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseAndInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseAndInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table(bitwise_and_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_and(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_and_dispatch_tables(); + using impl::bitwise_and_contig_dispatch_table; + using impl::bitwise_and_output_id_table; + using impl::bitwise_and_strided_dispatch_table; + + auto bitwise_and_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends,
bitwise_and_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_and_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_and_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_and_output_id_table); + }; + m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, ""); + + using impl::bitwise_and_inplace_contig_dispatch_table; + using impl::bitwise_and_inplace_strided_dispatch_table; + + auto bitwise_and_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_and_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_and_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_and_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_and_inplace", bitwise_and_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp new file mode 100644 index 0000000000..682b337efd --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_and(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp new file mode 100644 index 0000000000..29a04cff38 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp @@ -0,0 +1,123 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_invert.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_invert.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U08: ===== BITWISE_INVERT (x) +namespace impl +{ + +namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert; + +static unary_contig_impl_fn_ptr_t + bitwise_invert_contig_dispatch_vector[td_ns::num_types]; +static int bitwise_invert_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + bitwise_invert_strided_dispatch_vector[td_ns::num_types]; + +void populate_bitwise_invert_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_invert_fn_ns; + + using fn_ns::BitwiseInvertContigFactory; + DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, BitwiseInvertContigFactory, num_types> + dvb1; + dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector); + + using fn_ns::BitwiseInvertStridedFactory; + DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, BitwiseInvertStridedFactory, num_types> + dvb2; + dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector); + + using fn_ns::BitwiseInvertTypeMapFactory; + DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3; + dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector); +}; + +} // namespace impl + +void init_bitwise_invert(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_invert_dispatch_vectors(); + using impl::bitwise_invert_contig_dispatch_vector; + using impl::bitwise_invert_output_typeid_vector; + using impl::bitwise_invert_strided_dispatch_vector; + + auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + bitwise_invert_output_typeid_vector, + bitwise_invert_contig_dispatch_vector, + bitwise_invert_strided_dispatch_vector); + }; + m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type( + dtype, bitwise_invert_output_typeid_vector); + }; + m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp new file mode 100644 index 0000000000..5b5d8398dc --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_invert(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp new file mode 100644 index 0000000000..7969bc4ffa --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp @@ -0,0 +1,200 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
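Every unary `*_pyapi` lambda above delegates to `py_unary_ufunc`, which consults the type-map vector and then picks a contiguous or strided kernel. A hedged sketch of that selection step (the real helper also validates queues, shapes, and memory overlap; all names below are illustrative):

```cpp
// Illustrative kernel selection for a unary ufunc-style helper.
#include <cstddef>
#include <stdexcept>

using unary_fn = void (*)(std::size_t n, const char *src, char *dst);

inline unary_fn select_unary_kernel(int src_typeid,
                                    bool both_c_contiguous,
                                    const int *type_map,    // -1: unsupported
                                    const unary_fn *contig, // may hold nullptr
                                    const unary_fn *strided)
{
    if (type_map[src_typeid] < 0) {
        throw std::invalid_argument("input dtype is not supported");
    }
    // prefer the specialized contiguous kernel when the layout allows it
    if (both_c_contiguous && contig[src_typeid]) {
        return contig[src_typeid];
    }
    return strided[src_typeid];
}
```

The `_*_result_type` bindings expose only the first step of this logic, so Python-level code can query the output dtype without allocating anything.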
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_left_shift.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_left_shift.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B04: ===== BITWISE_LEFT_SHIFT (x1, x2) +namespace impl +{ +namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift; + +static binary_contig_impl_fn_ptr_t + bitwise_left_shift_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int bitwise_left_shift_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_left_shift_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_left_shift_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_left_shift_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_left_shift_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_left_shift_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseLeftShiftTypeMapFactory; + DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseLeftShiftStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseLeftShiftStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseLeftShiftContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseLeftShiftContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseLeftShiftInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseLeftShiftInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + bitwise_left_shift_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseLeftShiftInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseLeftShiftInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + bitwise_left_shift_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_left_shift(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { +
impl::populate_bitwise_left_shift_dispatch_tables(); + using impl::bitwise_left_shift_contig_dispatch_table; + using impl::bitwise_left_shift_output_id_table; + using impl::bitwise_left_shift_strided_dispatch_table; + + auto bitwise_left_shift_pyapi = [&](const arrayT &src1, + const arrayT &src2, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, + bitwise_left_shift_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_left_shift_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_left_shift_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_left_shift_result_type_pyapi = + [&](const py::dtype &dtype1, const py::dtype &dtype2) { + return py_binary_ufunc_result_type( + dtype1, dtype2, bitwise_left_shift_output_id_table); + }; + m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "", + py::arg("src1"), py::arg("src2"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_bitwise_left_shift_result_type", + bitwise_left_shift_result_type_pyapi, ""); + + using impl::bitwise_left_shift_inplace_contig_dispatch_table; + using impl::bitwise_left_shift_inplace_strided_dispatch_table; + + auto bitwise_left_shift_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, + bitwise_left_shift_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_left_shift_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_left_shift_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_left_shift_inplace", bitwise_left_shift_inplace_pyapi, + "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp new file mode 100644 index 0000000000..9edcba43ab --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_left_shift(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp new file mode 100644 index 0000000000..33a57f907c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
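The binary operations that follow index their dispatch tables by both input type ids. A sketch of the lookup behind the `_*_result_type` bindings (`N` stands in for `td_ns::num_types`; the sentinel convention is an assumption based on the negative-check pattern such helpers need):

```cpp
// 2-D output-id table lookup: entry [t1][t2] holds the result type id,
// or a negative sentinel when the dtype pair is unsupported.
#include <optional>

template <int N>
std::optional<int>
binary_result_typeid(const int (&output_id_table)[N][N], int t1, int t2)
{
    const int out = output_id_table[t1][t2];
    if (out < 0) {
        return std::nullopt; // no kernel registered for this combination
    }
    return out;
}
```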
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_or.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_or.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B05: ===== BITWISE_OR (x1, x2) +namespace impl +{ +namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or; + +static binary_contig_impl_fn_ptr_t + bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_or_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_or_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_or_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_or_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseOrTypeMapFactory; + DispatchTableBuilder<int, BitwiseOrTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_or_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseOrStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseOrStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseOrContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseOrContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseOrInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseOrInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table(bitwise_or_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseOrInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseOrInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table(bitwise_or_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_or(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_or_dispatch_tables(); + using impl::bitwise_or_contig_dispatch_table; + using impl::bitwise_or_output_id_table; + using impl::bitwise_or_strided_dispatch_table; + + auto bitwise_or_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_or_output_id_table, + // function pointers to
handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_or_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_or_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_or_output_id_table); + }; + m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, ""); + + using impl::bitwise_or_inplace_contig_dispatch_table; + using impl::bitwise_or_inplace_strided_dispatch_table; + + auto bitwise_or_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_or_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_or_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_or_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_or_inplace", bitwise_or_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp new file mode 100644 index 0000000000..7603ed8277 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
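The `_*_inplace` bindings reuse the same output-id table as the out-of-place variant. An in-place update `lhs op= rhs` is only representable when the computed result dtype equals the lhs dtype, and a hedged sketch of that check (not dpctl's actual code) makes the constraint explicit:

```cpp
// In-place feasibility test derived from the binary output-id table:
// "lhs op= rhs" works only if result_type(lhs, rhs) == dtype(lhs).
template <int N>
bool inplace_representable(const int (&output_id_table)[N][N],
                           int lhs_typeid,
                           int rhs_typeid)
{
    return output_id_table[lhs_typeid][rhs_typeid] == lhs_typeid;
}
```

This is why the in-place bindings need no separate type map: the existing table already encodes which (lhs, rhs) pairs are usable.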
+//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_or(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp new file mode 100644 index 0000000000..3847204b1f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp @@ -0,0 +1,201 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_right_shift.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_right_shift.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +namespace impl +{ +namespace bitwise_right_shift_fn_ns = + dpctl::tensor::kernels::bitwise_right_shift; + +static binary_contig_impl_fn_ptr_t + bitwise_right_shift_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int bitwise_right_shift_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_right_shift_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_right_shift_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_right_shift_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_right_shift_dispatch_tables(void) +{ + using
namespace td_ns; + namespace fn_ns = bitwise_right_shift_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseRightShiftTypeMapFactory; + DispatchTableBuilder<int, BitwiseRightShiftTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseRightShiftStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseRightShiftStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseRightShiftContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseRightShiftInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseRightShiftInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + bitwise_right_shift_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseRightShiftInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + bitwise_right_shift_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_right_shift(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_right_shift_dispatch_tables(); + using impl::bitwise_right_shift_contig_dispatch_table; + using impl::bitwise_right_shift_output_id_table; + using impl::bitwise_right_shift_strided_dispatch_table; + + auto bitwise_right_shift_pyapi = [&](const arrayT &src1, + const arrayT &src2, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, + bitwise_right_shift_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_right_shift_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_right_shift_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_right_shift_result_type_pyapi = + [&](const py::dtype &dtype1, const py::dtype &dtype2) { + return py_binary_ufunc_result_type( + dtype1, dtype2, bitwise_right_shift_output_id_table); + }; + m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "", + py::arg("src1"), py::arg("src2"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_bitwise_right_shift_result_type", + bitwise_right_shift_result_type_pyapi, ""); + + using impl::bitwise_right_shift_inplace_contig_dispatch_table; + using impl::bitwise_right_shift_inplace_strided_dispatch_table; + + auto bitwise_right_shift_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, + bitwise_right_shift_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) +
bitwise_right_shift_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_right_shift_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_right_shift_inplace", bitwise_right_shift_inplace_pyapi, + "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp new file mode 100644 index 0000000000..5ce2bca4e7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_right_shift(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp new file mode 100644 index 0000000000..71d606766f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
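The `td_ns::NullPtrTable<...>{}` arguments passed above let the bitwise operations opt out of the specialized matrix/row broadcast fast paths. An illustrative stand-in for that idiom (details of the real type in `utils/type_dispatch.hpp` are assumed):

```cpp
// A value-initialized table of function pointers: every entry is nullptr,
// signalling "no specialized broadcast kernel exists for this operation".
template <typename FnT, int N> struct NullTableSketch
{
    FnT table[N][N] = {}; // value-initialization zeroes every pointer
};
```

Passing an all-null table instead of overloading the helper keeps `py_binary_ufunc` to a single signature: the helper simply checks for `nullptr` and falls back to the contiguous or strided paths.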
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "bitwise_xor.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_xor.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B07: ===== BITWISE_XOR (x1, x2) +namespace impl +{ +namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor; + +static binary_contig_impl_fn_ptr_t + bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_xor_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_xor_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_xor_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_xor_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseXorTypeMapFactory; + DispatchTableBuilder<int, BitwiseXorTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(bitwise_xor_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseXorStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseXorStridedFactory, num_types> + dtb2; + dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseXorContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseXorContigFactory, num_types> + dtb3; + dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseXorInplaceStridedFactory; + DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t, BitwiseXorInplaceStridedFactory, num_types> + dtb4; + dtb4.populate_dispatch_table(bitwise_xor_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseXorInplaceContigFactory; + DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t, BitwiseXorInplaceContigFactory, num_types> + dtb5; + dtb5.populate_dispatch_table(bitwise_xor_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_xor(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_bitwise_xor_dispatch_tables(); + using impl::bitwise_xor_contig_dispatch_table; + using impl::bitwise_xor_output_id_table; + using impl::bitwise_xor_strided_dispatch_table; + + auto
bitwise_xor_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_xor_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_xor_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_xor_output_id_table); + }; + m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, ""); + + using impl::bitwise_xor_inplace_contig_dispatch_table; + using impl::bitwise_xor_inplace_strided_dispatch_table; + + auto bitwise_xor_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_xor_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_xor_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_xor_inplace", bitwise_xor_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp new file mode 100644 index 0000000000..7b092aadda --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
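For orientation, every binary operation in this patch populates the same five tables. Grouping them into one struct is purely illustrative (dpctl keeps them as separate file-scope arrays, and the value 14 for `td_ns::num_types` is an assumption covering bool, the eight integer dtypes, three real floating dtypes, and two complex dtypes):

```cpp
// Summary of the per-operation dispatch state; signatures elided.
using binary_fn = void (*)();
constexpr int N = 14; // assumed value of td_ns::num_types

struct BinaryOpDispatch
{
    int output_id[N][N];            // result dtype per (t1, t2) pair
    binary_fn contig[N][N];         // inputs and output all C-contiguous
    binary_fn strided[N][N];        // general strided case
    binary_fn inplace_contig[N][N]; // lhs op= rhs, contiguous
    binary_fn inplace_strided[N][N];// lhs op= rhs, strided
};
```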
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_xor(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp new file mode 100644 index 0000000000..b42f234c0d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "cbrt.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/cbrt.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U37: ==== CBRT (x) +namespace impl +{ + +namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt; + +static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types]; +static int cbrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cbrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_cbrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cbrt_fn_ns; + + using fn_ns::CbrtContigFactory; + DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory, num_types> + dvb1; + dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector); + + using fn_ns::CbrtStridedFactory; + DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory, num_types> + dvb2; + dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector); + + using fn_ns::CbrtTypeMapFactory; + DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3; + dvb3.populate_dispatch_vector(cbrt_output_typeid_vector); +}; + +} // namespace impl + +void init_cbrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_cbrt_dispatch_vectors(); + using impl::cbrt_contig_dispatch_vector; + using impl::cbrt_output_typeid_vector; + using impl::cbrt_strided_dispatch_vector; + + auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cbrt_output_typeid_vector, + cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector); + }; + m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector); + }; + m.def("_cbrt_result_type", cbrt_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp new file mode 100644 index 0000000000..74da1de81a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
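All the `init_*` functions register bindings with the same pybind11 pattern: keyword arguments plus a default-empty `depends` list. A minimal, self-contained illustration of just that pattern (the module and function names here are hypothetical):

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

PYBIND11_MODULE(_sketch, m)
{
    auto noop_pyapi = [](const py::object &src, const py::object &dst,
                         const py::object &sycl_queue,
                         const py::list &depends) {
        // a real binding would dispatch a kernel and return events here
        (void)src;
        (void)dst;
        (void)sycl_queue;
        return py::len(depends);
    };
    m.def("_noop", noop_pyapi, "", py::arg("src"), py::arg("dst"),
          py::arg("sycl_queue"), py::arg("depends") = py::list());
}
```

Defaulting `depends` to an empty `py::list()` lets Python callers omit the dependency events entirely while still allowing explicit event chaining when queues are shared.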
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include <pybind11/pybind11.h> + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cbrt(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp new file mode 100644 index 0000000000..f1bb362c5b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include <pybind11/numpy.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> +#include <vector> + +#include "ceil.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/ceil.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U09: ==== CEIL (x) +namespace impl +{ + +namespace ceil_fn_ns = dpctl::tensor::kernels::ceil; + +static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types]; +static int ceil_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + ceil_strided_dispatch_vector[td_ns::num_types]; + +void populate_ceil_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = ceil_fn_ns; + + using fn_ns::CeilContigFactory; + DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CeilContigFactory, num_types> + dvb1; + dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector); + + using fn_ns::CeilStridedFactory; + DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CeilStridedFactory, num_types> + dvb2; + dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector); + + using fn_ns::CeilTypeMapFactory; + DispatchVectorBuilder<int, CeilTypeMapFactory, num_types> dvb3; + dvb3.populate_dispatch_vector(ceil_output_typeid_vector); +}; + +} // namespace impl + +void init_ceil(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_ceil_dispatch_vectors(); + using impl::ceil_contig_dispatch_vector; + using impl::ceil_output_typeid_vector; + using impl::ceil_strided_dispatch_vector; + + auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, ceil_output_typeid_vector, + ceil_contig_dispatch_vector, ceil_strided_dispatch_vector); + }; + m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto ceil_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector); + }; + m.def("_ceil_result_type", ceil_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp new file mode 100644 index 0000000000..4a6caf999b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_ceil(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp
new file mode 100644
index 0000000000..cac84e63fb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
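Each operation gets a matching header like ceil.hpp above that exposes only its init_<name>(py::module_) hook. A minimal self-contained sketch of the same registration split, with invented names (toy_ops, init_negate) standing in for the dpctl ones:

// Sketch only: each translation unit exposes one init hook, and a
// central module definition calls the hooks in turn.
#include <pybind11/pybind11.h>

namespace py = pybind11;

// would live in negate.cpp, declared extern in negate.hpp
void init_negate(py::module_ m)
{
    m.def("negate", [](int x) { return -x; });
}

PYBIND11_MODULE(toy_ops, m)
{
    init_negate(m); // each elementwise op registers itself here
}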
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "conj.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/conj.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U10: ==== CONJ (x) +namespace impl +{ + +namespace conj_fn_ns = dpctl::tensor::kernels::conj; + +static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types]; +static int conj_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + conj_strided_dispatch_vector[td_ns::num_types]; + +void populate_conj_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = conj_fn_ns; + + using fn_ns::ConjContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(conj_contig_dispatch_vector); + + using fn_ns::ConjStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(conj_strided_dispatch_vector); + + using fn_ns::ConjTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(conj_output_typeid_vector); +}; + +} // namespace impl + +void init_conj(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_conj_dispatch_vectors(); + using impl::conj_contig_dispatch_vector; + using impl::conj_output_typeid_vector; + using impl::conj_strided_dispatch_vector; + + auto conj_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, conj_output_typeid_vector, + conj_contig_dispatch_vector, conj_strided_dispatch_vector); + }; + m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto conj_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector); + }; + m.def("_conj_result_type", conj_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp new file mode 100644 index 0000000000..33d9993019 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_conj(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp new file mode 100644 index 0000000000..6a887e0345 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
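The populate_*_dispatch_vectors functions above all follow one pattern: a factory template is instantiated once per supported type, and the results are gathered into a typeid-indexed array of function pointers. A self-contained sketch of that idea, with toy stand-ins for DispatchVectorBuilder and the dpctl typeid set:

// Sketch only: NegateFactory and the three-type typeid set are
// illustrative, not dpctl's actual utilities.
#include <cstddef>
#include <iostream>

using unary_fn_ptr_t = void (*)(const void *, void *, std::size_t);

template <typename T>
void negate_impl(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T *d = static_cast<T *>(dst);
    for (std::size_t i = 0; i < n; ++i)
        d[i] = -s[i];
}

template <typename T> struct NegateFactory
{
    unary_fn_ptr_t get() { return negate_impl<T>; }
};

// typeid 0 -> int, 1 -> float, 2 -> double (toy numbering)
static unary_fn_ptr_t negate_dispatch_vector[3];

void populate_negate_dispatch_vector()
{
    negate_dispatch_vector[0] = NegateFactory<int>{}.get();
    negate_dispatch_vector[1] = NegateFactory<float>{}.get();
    negate_dispatch_vector[2] = NegateFactory<double>{}.get();
}

int main()
{
    populate_negate_dispatch_vector();
    double in[3] = {1.0, -2.5, 3.0}, out[3];
    negate_dispatch_vector[2](in, out, 3); // typeid 2 selects the double kernel
    std::cout << out[0] << " " << out[1] << " " << out[2] << "\n";
}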
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "copysign.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/copysign.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B25: ===== COPYSIGN (x1, x2) +namespace impl +{ +namespace copysign_fn_ns = dpctl::tensor::kernels::copysign; + +static binary_contig_impl_fn_ptr_t + copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int copysign_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_copysign_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = copysign_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::CopysignTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(copysign_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::CopysignStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(copysign_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::CopysignContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(copysign_contig_dispatch_table); +}; + +} // namespace impl + +void init_copysign(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_copysign_dispatch_tables(); + using impl::copysign_contig_dispatch_table; + using impl::copysign_output_id_table; + using impl::copysign_strided_dispatch_table; + + auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, copysign_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + copysign_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + copysign_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto copysign_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + copysign_output_id_table); + }; + m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + 
m.def("_copysign_result_type", copysign_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp new file mode 100644 index 0000000000..d22cbdb0f0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_copysign(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp new file mode 100644 index 0000000000..1986610510 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
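Binary operations such as copysign use two-dimensional tables indexed by both argument typeids, with a negative entry marking an unsupported combination. A minimal sketch of that lookup under an invented three-type system:

// Sketch only: the typeids and table contents are illustrative.
#include <iostream>

constexpr int num_types = 3; // 0: int, 1: float, 2: double (toy set)

// result typeid for a copysign-like real-valued op; -1 means unsupported
static int output_id_table[num_types][num_types] = {
    /* int    */ {-1, 1, 2},
    /* float  */ {1, 1, 2},
    /* double */ {2, 2, 2},
};

int result_typeid(int t1, int t2)
{
    if (t1 < 0 || t1 >= num_types || t2 < 0 || t2 >= num_types)
        return -1;
    return output_id_table[t1][t2];
}

int main()
{
    std::cout << result_typeid(1, 2) << "\n"; // float, double -> double (2)
    std::cout << result_typeid(0, 0) << "\n"; // int, int -> unsupported (-1)
}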
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "cos.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/cos.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U11: ==== COS (x) +namespace impl +{ + +namespace cos_fn_ns = dpctl::tensor::kernels::cos; + +static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types]; +static int cos_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cos_strided_dispatch_vector[td_ns::num_types]; + +void populate_cos_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cos_fn_ns; + + using fn_ns::CosContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cos_contig_dispatch_vector); + + using fn_ns::CosStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cos_strided_dispatch_vector); + + using fn_ns::CosTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cos_output_typeid_vector); +}; + +} // namespace impl + +void init_cos(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_cos_dispatch_vectors(); + using impl::cos_contig_dispatch_vector; + using impl::cos_output_typeid_vector; + using impl::cos_strided_dispatch_vector; + + auto cos_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cos_output_typeid_vector, + cos_contig_dispatch_vector, cos_strided_dispatch_vector); + }; + m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cos_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector); + }; + m.def("_cos_result_type", cos_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp new file mode 100644 index 0000000000..1753058024 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cos(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp new file mode 100644 index 0000000000..0bb74df979 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
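Each dispatch vector pairs a contiguous implementation with a strided one: the contiguous kernel walks memory linearly, while the strided kernel must honor arbitrary element strides. An illustrative host-side sketch of the distinction (the real kernels are SYCL submissions, not plain loops):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// contiguous path: unit stride, easy for the compiler to vectorize
void cos_contig(const double *src, double *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = std::cos(src[i]);
}

// strided path: element offsets computed from explicit strides
void cos_strided(const double *src, double *dst, std::size_t n,
                 std::ptrdiff_t src_stride, std::ptrdiff_t dst_stride)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i * dst_stride] = std::cos(src[i * src_stride]);
}

int main()
{
    std::vector<double> a = {0.0, 0.5, 1.0, 1.5}, b(4, -1.0);
    cos_contig(a.data(), b.data(), 4);        // fills all of b
    cos_strided(a.data(), b.data(), 2, 2, 2); // touches b[0] and b[2] only
    std::cout << b[0] << " " << b[1] << "\n";
}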
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "cosh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/cosh.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U12: ==== COSH (x) +namespace impl +{ + +namespace cosh_fn_ns = dpctl::tensor::kernels::cosh; + +static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types]; +static int cosh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cosh_strided_dispatch_vector[td_ns::num_types]; + +void populate_cosh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cosh_fn_ns; + + using fn_ns::CoshContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector); + + using fn_ns::CoshStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector); + + using fn_ns::CoshTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cosh_output_typeid_vector); +}; + +} // namespace impl + +void init_cosh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_cosh_dispatch_vectors(); + using impl::cosh_contig_dispatch_vector; + using impl::cosh_output_typeid_vector; + using impl::cosh_strided_dispatch_vector; + + auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cosh_output_typeid_vector, + cosh_contig_dispatch_vector, cosh_strided_dispatch_vector); + }; + m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cosh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector); + }; + m.def("_cosh_result_type", cosh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp new file mode 100644 index 0000000000..c1eba05ea5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cosh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp new file mode 100644 index 0000000000..751e44ff55 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp @@ -0,0 +1,181 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "abs.hpp"
+#include "acos.hpp"
+#include "acosh.hpp"
+#include "add.hpp"
+#include "asin.hpp"
+#include "asinh.hpp"
+#include "atan.hpp"
+#include "atan2.hpp"
+#include "atanh.hpp"
+#include "bitwise_and.hpp"
+#include "bitwise_invert.hpp"
+#include "bitwise_left_shift.hpp"
+#include "bitwise_or.hpp"
+#include "bitwise_right_shift.hpp"
+#include "bitwise_xor.hpp"
+#include "cbrt.hpp"
+#include "ceil.hpp"
+#include "conj.hpp"
+#include "copysign.hpp"
+#include "cos.hpp"
+#include "cosh.hpp"
+#include "equal.hpp"
+#include "exp.hpp"
+#include "exp2.hpp"
+#include "expm1.hpp"
+#include "floor.hpp"
+#include "floor_divide.hpp"
+#include "greater.hpp"
+#include "greater_equal.hpp"
+#include "hypot.hpp"
+#include "imag.hpp"
+#include "isfinite.hpp"
+#include "isinf.hpp"
+#include "isnan.hpp"
+#include "less.hpp"
+#include "less_equal.hpp"
+#include "log.hpp"
+#include "log10.hpp"
+#include "log1p.hpp"
+#include "log2.hpp"
+#include "logaddexp.hpp"
+#include "logical_and.hpp"
+#include "logical_not.hpp"
+#include "logical_or.hpp"
+#include "logical_xor.hpp"
+#include "maximum.hpp"
+#include "minimum.hpp"
+#include "multiply.hpp"
+#include "negative.hpp"
+#include "not_equal.hpp"
+#include "positive.hpp"
+#include "pow.hpp"
+#include "proj.hpp"
+#include "real.hpp"
+#include "remainder.hpp"
+#include "round.hpp"
+#include "rsqrt.hpp"
+#include "sign.hpp"
+#include "signbit.hpp"
+#include "sin.hpp"
+#include "sinh.hpp"
+#include "sqrt.hpp"
+#include "square.hpp"
+#include "subtract.hpp"
+#include "tan.hpp"
+#include "tanh.hpp"
+#include "true_divide.hpp"
+#include "trunc.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+/*!
@brief Add elementwise functions to Python module */ +void init_elementwise_functions(py::module_ m) +{ + init_abs(m); + init_acos(m); + init_acosh(m); + init_add(m); + init_asin(m); + init_asinh(m); + init_atan(m); + init_atan2(m); + init_atanh(m); + init_bitwise_and(m); + init_bitwise_invert(m); + init_bitwise_left_shift(m); + init_bitwise_or(m); + init_bitwise_right_shift(m); + init_bitwise_xor(m); + init_cbrt(m); + init_ceil(m); + init_conj(m); + init_copysign(m); + init_cos(m); + init_cosh(m); + init_divide(m); + init_equal(m); + init_exp(m); + init_exp2(m); + init_expm1(m); + init_floor(m); + init_floor_divide(m); + init_greater(m); + init_greater_equal(m); + init_hypot(m); + init_imag(m); + init_isfinite(m); + init_isinf(m); + init_isnan(m); + init_less(m); + init_less_equal(m); + init_log(m); + init_log10(m); + init_log1p(m); + init_log2(m); + init_logaddexp(m); + init_logical_and(m); + init_logical_not(m); + init_logical_or(m); + init_logical_xor(m); + init_maximum(m); + init_minimum(m); + init_multiply(m); + init_negative(m); + init_not_equal(m); + init_positive(m); + init_pow(m); + init_proj(m); + init_real(m); + init_remainder(m); + init_round(m); + init_rsqrt(m); + init_sign(m); + init_signbit(m); + init_sin(m); + init_sinh(m); + init_sqrt(m); + init_square(m); + init_subtract(m); + init_tan(m); + init_tanh(m); + init_trunc(m); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp new file mode 100644 index 0000000000..ef9182f9a2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
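init_elementwise_functions simply chains the per-file hooks, so adding a new operation means one new #include and one new init_*(m) call. An equivalent, illustrative form of the same aggregation using an array of hooks (all names invented):

#include <pybind11/pybind11.h>

namespace py = pybind11;

void init_op_a(py::module_ m) { m.def("_op_a", [](int x) { return x + 1; }); }
void init_op_b(py::module_ m) { m.def("_op_b", [](int x) { return x - 1; }); }

void init_all_demo(py::module_ m)
{
    using init_fn_t = void (*)(py::module_);
    constexpr init_fn_t hooks[] = {&init_op_a, &init_op_b};
    for (init_fn_t hook : hooks)
        hook(m); // each hook adds only its own defs, so order is immaterial
}

PYBIND11_MODULE(toy_impl, m) { init_all_demo(m); }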
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_elementwise_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp similarity index 97% rename from dpctl/tensor/libtensor/source/elementwise_functions.hpp rename to dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 523e4259c3..6817a3541c 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -22,7 +22,6 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions, /// specifically functions for elementwise operations. //===----------------------------------------------------------------------===// - #pragma once #include "dpctl4pybind11.hpp" @@ -30,14 +29,17 @@ #include #include #include -#include #include +#include "elementwise_functions_type_utils.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch.hpp" +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + namespace dpctl { namespace tensor @@ -45,11 +47,7 @@ namespace tensor namespace py_internal { -namespace td_ns = dpctl::tensor::type_dispatch; - -extern py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t); -extern int _result_typeid(int arg_typeid, const int *fn_output_id); - +/*! @brief Template implementing Python API for unary elementwise functions */ template @@ -251,6 +249,8 @@ py_unary_ufunc(const dpctl::tensor::usm_ndarray &src, strided_fn_ev); } +/*! @brief Template implementing Python API for querying of type support by + * unary elementwise functions */ template py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, const output_typesT &output_types) @@ -266,6 +266,7 @@ py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, throw py::value_error(e.what()); } + using dpctl::tensor::py_internal::type_utils::_result_typeid; int dst_typeid = _result_typeid(src_typeid, output_types); if (dst_typeid < 0) { @@ -273,8 +274,9 @@ py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, return py::cast(res); } else { - auto dst_typenum_t = static_cast(dst_typeid); + using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum; + auto dst_typenum_t = static_cast(dst_typeid); auto dt = _dtype_from_typenum(dst_typenum_t); return py::cast(dt); @@ -292,6 +294,8 @@ bool isEqual(Container const &c, std::initializer_list const &l) } } // namespace +/*! @brief Template implementing Python API for binary elementwise + * functions */ template py_binary_ufunc( strided_fn_ev); } +/*! 
@brief Type querying for binary elementwise functions */ template py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, const py::dtype &input2_dtype, @@ -590,8 +595,9 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, return py::cast(res); } else { - auto dst_typenum_t = static_cast(dst_typeid); + using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum; + auto dst_typenum_t = static_cast(dst_typeid); auto dt = _dtype_from_typenum(dst_typenum_t); return py::cast(dt); @@ -825,8 +831,6 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, strided_fn_ev); } -extern void init_elementwise_functions(py::module_ m); - } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp new file mode 100644 index 0000000000..473048e8fa --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp @@ -0,0 +1,95 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions for looking of supported types in elementwise +/// functions. 
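The *_result_type entry points built on py_unary_ufunc_result_type and py_binary_ufunc_result_type implement a pure type query: map the input dtype to a typeid, look the output typeid up in the op's table, and map back to a dtype, with None returned for unsupported inputs. A self-contained sketch of that flow under an invented three-type system:

// Sketch only: type_names and cos_output_ids are illustrative.
#include <iostream>
#include <optional>
#include <string>

constexpr int num_types = 3;
static const char *type_names[num_types] = {"int32", "float32", "float64"};
static int cos_output_ids[num_types] = {2, 1, 2}; // toy: int promotes to double

std::optional<std::string> result_type(int src_typeid)
{
    if (src_typeid < 0 || src_typeid >= num_types)
        return std::nullopt; // corresponds to raising py::value_error
    int dst = cos_output_ids[src_typeid];
    if (dst < 0)
        return std::nullopt; // corresponds to returning py::none()
    return std::string(type_names[dst]);
}

int main()
{
    std::cout << result_type(0).value_or("None") << "\n"; // float64
}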
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions_type_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+namespace type_utils
+{
+
+py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t)
+{
+    switch (dst_typenum_t) {
+    case td_ns::typenum_t::BOOL:
+        return py::dtype("?");
+    case td_ns::typenum_t::INT8:
+        return py::dtype("i1");
+    case td_ns::typenum_t::UINT8:
+        return py::dtype("u1");
+    case td_ns::typenum_t::INT16:
+        return py::dtype("i2");
+    case td_ns::typenum_t::UINT16:
+        return py::dtype("u2");
+    case td_ns::typenum_t::INT32:
+        return py::dtype("i4");
+    case td_ns::typenum_t::UINT32:
+        return py::dtype("u4");
+    case td_ns::typenum_t::INT64:
+        return py::dtype("i8");
+    case td_ns::typenum_t::UINT64:
+        return py::dtype("u8");
+    case td_ns::typenum_t::HALF:
+        return py::dtype("f2");
+    case td_ns::typenum_t::FLOAT:
+        return py::dtype("f4");
+    case td_ns::typenum_t::DOUBLE:
+        return py::dtype("f8");
+    case td_ns::typenum_t::CFLOAT:
+        return py::dtype("c8");
+    case td_ns::typenum_t::CDOUBLE:
+        return py::dtype("c16");
+    default:
+        throw py::value_error("Unrecognized dst_typeid");
+    }
+}
+
+int _result_typeid(int arg_typeid, const int *fn_output_id)
+{
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) {
+        throw py::value_error("Input typeid " + std::to_string(arg_typeid) +
+                              " is outside of expected bounds.");
+    }
+
+    return fn_output_id[arg_typeid];
+}
+
+} // namespace type_utils
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
new file mode 100644
index 0000000000..6dac195dc2
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
@@ -0,0 +1,56 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares functions for looking up supported types in elementwise
+/// functions.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+namespace type_utils
+{
+
+/*! @brief Produce dtype from a type number */
+extern py::dtype _dtype_from_typenum(td_ns::typenum_t);
+
+/*!
@brief Lookup typeid of the result from typeid of + * argument and the mapping table */ +extern int _result_typeid(int, const int *); + +} // namespace type_utils +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp new file mode 100644 index 0000000000..f36ec1b446 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "equal.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/equal.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B09: ===== EQUAL (x1, x2) +namespace impl +{ +namespace equal_fn_ns = dpctl::tensor::kernels::equal; + +static binary_contig_impl_fn_ptr_t + equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::EqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::EqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::EqualContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + 
impl::populate_equal_dispatch_tables(); + using impl::equal_contig_dispatch_table; + using impl::equal_output_id_table; + using impl::equal_strided_dispatch_table; + + auto equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + equal_output_id_table); + }; + m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_equal_result_type", equal_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp new file mode 100644 index 0000000000..21ac4ad6b4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
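Operations that provide no contig-matrix/contig-row broadcast specializations, such as _equal above, pass NullPtrTable placeholders; py_binary_ufunc then falls back to the general strided path for those shapes. A sketch of that null-means-fallback convention (NullPtrTableDemo and the callback types are invented):

#include <iostream>

constexpr int num_types = 2;
using contig_fn_t = void (*)();

template <typename fnT> struct NullPtrTableDemo
{
    fnT table[num_types][num_types] = {}; // value-initialized to nullptr
};

void dispatch(contig_fn_t specialized, void (*fallback)())
{
    if (specialized)
        specialized();
    else
        fallback(); // no specialization registered: general strided path
}

int main()
{
    NullPtrTableDemo<contig_fn_t> no_broadcast_impls;
    dispatch(no_broadcast_impls.table[0][1],
             [] { std::cout << "general strided fallback\n"; });
}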
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_equal(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp new file mode 100644 index 0000000000..51ccaaac70 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "exp.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/exp.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U13: ==== EXP (x) +namespace impl +{ + +namespace exp_fn_ns = dpctl::tensor::kernels::exp; + +static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types]; +static int exp_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + exp_strided_dispatch_vector[td_ns::num_types]; + +void populate_exp_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = exp_fn_ns; + + using fn_ns::ExpContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(exp_contig_dispatch_vector); + + using fn_ns::ExpStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(exp_strided_dispatch_vector); + + using fn_ns::ExpTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(exp_output_typeid_vector); +}; + +} // namespace impl + +void init_exp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_exp_dispatch_vectors(); + using impl::exp_contig_dispatch_vector; + using impl::exp_output_typeid_vector; + using impl::exp_strided_dispatch_vector; + + auto exp_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, 
exp_output_typeid_vector, + exp_contig_dispatch_vector, exp_strided_dispatch_vector); + }; + m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto exp_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector); + }; + m.def("_exp_result_type", exp_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp new file mode 100644 index 0000000000..7227f0a2dc --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_exp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp new file mode 100644 index 0000000000..438ad0800e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
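The dispatch vectors above are file-scope statics populated exactly once from the init_* functions during module import, which appears to rely on import running single-threaded under the GIL; no explicit guard is used. A sketch of the same populate-once idiom with an explicit std::call_once guard, for contexts where that assumption does not hold:

#include <iostream>
#include <mutex>

static int table[3];

void populate_table()
{
    static std::once_flag flag;
    std::call_once(flag, [] {
        table[0] = 10;
        table[1] = 11;
        table[2] = 12;
        std::cout << "populated\n";
    });
}

int main()
{
    populate_table();
    populate_table(); // second call is a no-op
}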
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "exp2.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/exp2.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U38: ==== EXP2 (x) +namespace impl +{ + +namespace exp2_fn_ns = dpctl::tensor::kernels::exp2; + +static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types]; +static int exp2_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + exp2_strided_dispatch_vector[td_ns::num_types]; + +void populate_exp2_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = exp2_fn_ns; + + using fn_ns::Exp2ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector); + + using fn_ns::Exp2StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector); + + using fn_ns::Exp2TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(exp2_output_typeid_vector); +}; + +} // namespace impl + +void init_exp2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_exp2_dispatch_vectors(); + using impl::exp2_contig_dispatch_vector; + using impl::exp2_output_typeid_vector; + using impl::exp2_strided_dispatch_vector; + + auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, exp2_output_typeid_vector, + exp2_contig_dispatch_vector, exp2_strided_dispatch_vector); + }; + m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto exp2_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector); + }; + m.def("_exp2_result_type", exp2_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp new file mode 100644 index 0000000000..be041e1f8d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_exp2(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp new file mode 100644 index 0000000000..3b9332c4f1 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "expm1.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/expm1.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U14: ==== EXPM1 (x) +namespace impl +{ + +namespace expm1_fn_ns = dpctl::tensor::kernels::expm1; + +static unary_contig_impl_fn_ptr_t + expm1_contig_dispatch_vector[td_ns::num_types]; +static int expm1_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + expm1_strided_dispatch_vector[td_ns::num_types]; + +void populate_expm1_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = expm1_fn_ns; + + using fn_ns::Expm1ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector); + + using fn_ns::Expm1StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector); + + using fn_ns::Expm1TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(expm1_output_typeid_vector); +}; + +} // namespace impl + +void init_expm1(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_expm1_dispatch_vectors(); + using impl::expm1_contig_dispatch_vector; + using impl::expm1_output_typeid_vector; + using impl::expm1_strided_dispatch_vector; + + auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, expm1_output_typeid_vector, + expm1_contig_dispatch_vector, expm1_strided_dispatch_vector); + }; + m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto expm1_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + expm1_output_typeid_vector); + }; + m.def("_expm1_result_type", expm1_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp new file mode 100644 index 0000000000..6e39644835 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp
new file mode 100644
index 0000000000..6e39644835
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_expm1(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp
new file mode 100644
index 0000000000..9ccf89f13a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "floor.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/floor.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U15: ==== FLOOR (x)
+namespace impl
+{
+
+namespace floor_fn_ns = dpctl::tensor::kernels::floor;
+
+static unary_contig_impl_fn_ptr_t
+    floor_contig_dispatch_vector[td_ns::num_types];
+static int floor_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    floor_strided_dispatch_vector[td_ns::num_types];
+
+void populate_floor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_fn_ns;
+
+    using fn_ns::FloorContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
+
+    using fn_ns::FloorStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
+
+    using fn_ns::FloorTypeMapFactory;
+    DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(floor_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_floor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_floor_dispatch_vectors();
+        using impl::floor_contig_dispatch_vector;
+        using impl::floor_output_typeid_vector;
+        using impl::floor_strided_dispatch_vector;
+
+        auto floor_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, floor_output_typeid_vector,
+                floor_contig_dispatch_vector, floor_strided_dispatch_vector);
+        };
+        m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              floor_output_typeid_vector);
+        };
+        m.def("_floor_result_type", floor_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
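Because every binding accepts a depends keyword (defaulting to an empty py::list()), kernels can be chained without host-side synchronization. A sketch under the same assumptions as the earlier snippet, with hypothetical arrays x, y, z on one queue:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.linspace(0.0, 4.0, num=256, sycl_queue=q)
    y = dpt.empty_like(x)
    z = dpt.empty_like(x)
    # floor may start as soon as expm1's computation event ev1 settles
    ht1, ev1 = ti._expm1(src=x, dst=y, sycl_queue=q)
    ht2, ev2 = ti._floor(src=y, dst=z, sycl_queue=q, depends=[ev1])
    ht2.wait()
    ht1.wait()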
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp
new file mode 100644
index 0000000000..b742b058ad
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_floor(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
new file mode 100644
index 0000000000..e75fc56c67
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
@@ -0,0 +1,190 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "floor_divide.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/floor_divide.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B10: ===== FLOOR_DIVIDE (x1, x2)
+namespace impl
+{
+namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
+
+static binary_contig_impl_fn_ptr_t
+    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    floor_divide_inplace_contig_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    floor_divide_inplace_strided_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+void populate_floor_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::FloorDivideTypeMapFactory;
+    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(floor_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::FloorDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         FloorDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::FloorDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::FloorDivideInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         FloorDivideInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::FloorDivideInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         FloorDivideInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_floor_divide(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_floor_divide_dispatch_tables();
+        using impl::floor_divide_contig_dispatch_table;
+        using impl::floor_divide_output_id_table;
+        using impl::floor_divide_strided_dispatch_table;
+
+        auto floor_divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                      const arrayT &dst, sycl::queue &exec_q,
+                                      const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                floor_divide_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                floor_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                floor_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                  const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               floor_divide_output_id_table);
+        };
+        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
+
+        using impl::floor_divide_inplace_contig_dispatch_table;
+        using impl::floor_divide_inplace_strided_dispatch_table;
+
+        auto floor_divide_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends, floor_divide_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    floor_divide_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    floor_divide_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
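floor_divide additionally wires up the in-place dispatch tables, so _floor_divide_inplace mutates its first argument. A hedged sketch, assuming the i8/i8 pair is present in the in-place type map on the build at hand (keyword names lhs/rhs come from the m.def call above):

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    a = dpt.arange(10, dtype="i8", sycl_queue=q)
    b = dpt.full(10, 3, dtype="i8", sycl_queue=q)
    ht_ev, ev = ti._floor_divide_inplace(lhs=a, rhs=b, sycl_queue=q)
    ht_ev.wait()  # a now holds a // b, elementwise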
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
new file mode 100644
index 0000000000..c7f0d40dcc
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_floor_divide(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp
new file mode 100644
index 0000000000..f79102df47
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "greater.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B11: ===== GREATER (x1, x2)
+namespace impl
+{
+namespace greater_fn_ns = dpctl::tensor::kernels::greater;
+
+static binary_contig_impl_fn_ptr_t
+    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterTypeMapFactory;
+    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_dispatch_tables();
+        using impl::greater_contig_dispatch_table;
+        using impl::greater_output_id_table;
+        using impl::greater_strided_dispatch_table;
+
+        auto greater_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                greater_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_output_id_table);
+        };
+        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_result_type", greater_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
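At runtime the DispatchTableBuilder machinery reduces to a plain num_types x num_types lookup of implementation pointers indexed by the typeids of the two inputs. A self-contained Python miniature of the same idea (illustrative only; the typeids and the helper names here are hypothetical, not dpctl's):

    # hypothetical typeids
    INT32, FLOAT32, FLOAT64 = 0, 1, 2
    num_types = 3

    # table[t1][t2] holds the kernel for that input-type pair,
    # or None where the combination is unsupported
    dispatch_table = [[None] * num_types for _ in range(num_types)]

    def greater_f4_f8(src1, src2, dst):
        """Elementwise src1 > src2 written into a boolean dst."""
        for i, (a, b) in enumerate(zip(src1, src2)):
            dst[i] = a > b

    dispatch_table[FLOAT32][FLOAT64] = greater_f4_f8

    def call(src1_typeid, src2_typeid, *args):
        fn = dispatch_table[src1_typeid][src2_typeid]
        if fn is None:
            raise TypeError("unsupported type combination")
        return fn(*args)

The factory templates (GreaterContigFactory and friends) fill this table exhaustively at module initialization, which is why populate_*_dispatch_tables runs once inside each init_* function.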
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp
new file mode 100644
index 0000000000..ba8dc57bb0
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_greater(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
new file mode 100644
index 0000000000..005679c3fb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
@@ -0,0 +1,141 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "greater_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater_equal.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B12: ===== GREATER_EQUAL (x1, x2)
+namespace impl
+{
+namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
+
+static binary_contig_impl_fn_ptr_t
+    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterEqualTypeMapFactory;
+    DispatchTableBuilder<int, GreaterEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         GreaterEqualStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         GreaterEqualContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_equal_dispatch_tables();
+        using impl::greater_equal_contig_dispatch_table;
+        using impl::greater_equal_output_id_table;
+        using impl::greater_equal_strided_dispatch_table;
+
+        auto greater_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                       const arrayT &dst, sycl::queue &exec_q,
+                                       const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                greater_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                   const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_equal_output_id_table);
+        };
+        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi,
+              "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
new file mode 100644
index 0000000000..2cf116566e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_greater_equal(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp
new file mode 100644
index 0000000000..2442710198
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "hypot.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/hypot.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B24: ===== HYPOT (x1, x2)
+namespace impl
+{
+namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
+
+static binary_contig_impl_fn_ptr_t
+    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_hypot_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = hypot_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::HypotTypeMapFactory;
+    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(hypot_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::HypotStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::HypotContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_hypot(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_hypot_dispatch_tables();
+        using impl::hypot_contig_dispatch_table;
+        using impl::hypot_output_id_table;
+        using impl::hypot_strided_dispatch_table;
+
+        auto hypot_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, hypot_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                hypot_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                hypot_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto hypot_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               hypot_output_id_table);
+        };
+        m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_hypot_result_type", hypot_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
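The _<op>_result_type helpers expose the type-map table to Python; this is what lets callers size the destination before dispatching. A hedged sketch, assuming the usual IEEE promotion rules encoded by HypotTypeMapFactory on a typical build:

    import numpy as np
    import dpctl.tensor._tensor_impl as ti

    # consults the table built from HypotTypeMapFactory;
    # mixing f4 and f8 is expected to promote to f8
    out_dt = ti._hypot_result_type(np.dtype("f4"), np.dtype("f8"))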
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp
new file mode 100644
index 0000000000..2d154917ea
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_hypot(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp
new file mode 100644
index 0000000000..4012b9206f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "imag.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/imag.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U16: ==== IMAG (x)
+namespace impl
+{
+
+namespace imag_fn_ns = dpctl::tensor::kernels::imag;
+
+static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types];
+static int imag_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    imag_strided_dispatch_vector[td_ns::num_types];
+
+void populate_imag_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = imag_fn_ns;
+
+    using fn_ns::ImagContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ImagContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(imag_contig_dispatch_vector);
+
+    using fn_ns::ImagStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ImagStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(imag_strided_dispatch_vector);
+
+    using fn_ns::ImagTypeMapFactory;
+    DispatchVectorBuilder<int, ImagTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(imag_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_imag(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_imag_dispatch_vectors();
+        using impl::imag_contig_dispatch_vector;
+        using impl::imag_output_typeid_vector;
+        using impl::imag_strided_dispatch_vector;
+
+        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, imag_output_typeid_vector,
+                imag_contig_dispatch_vector, imag_strided_dispatch_vector);
+        };
+        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
+        };
+        m.def("_imag_result_type", imag_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
new file mode 100644
index 0000000000..ffac3f2465
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_imag(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
new file mode 100644
index 0000000000..73a2be4010
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "isfinite.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isfinite.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U17: ==== ISFINITE (x)
+namespace impl
+{
+
+namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
+
+static unary_contig_impl_fn_ptr_t
+    isfinite_contig_dispatch_vector[td_ns::num_types];
+static int isfinite_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isfinite_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isfinite_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isfinite_fn_ns;
+
+    using fn_ns::IsFiniteContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
+
+    using fn_ns::IsFiniteStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
+
+    using fn_ns::IsFiniteTypeMapFactory;
+    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isfinite(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isfinite_dispatch_vectors();
+        using impl::isfinite_contig_dispatch_vector;
+        using impl::isfinite_output_typeid_vector;
+        using impl::isfinite_strided_dispatch_vector;
+
+        auto isfinite_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  isfinite_output_typeid_vector,
+                                  isfinite_contig_dispatch_vector,
+                                  isfinite_strided_dispatch_vector);
+        };
+        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isfinite_output_typeid_vector);
+        };
+        m.def("_isfinite_result_type", isfinite_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
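Unary ops use the one-dimensional analogue of the table above: a dispatch vector indexed by the single input typeid, plus a parallel int vector naming the output typeid (bool for the is* predicates). A Python miniature, again purely illustrative with hypothetical typeids:

    INT32, FLOAT32, FLOAT64 = 0, 1, 2
    BOOL = 3  # hypothetical typeid of bool
    num_types = 4

    contig_vector = [None] * num_types        # per-input-type kernels
    output_typeid_vector = [-1] * num_types   # output typeid per input type

    def isfinite_f4(src, dst):
        """Write a boolean per element of a float32 src."""
        for i, v in enumerate(src):
            dst[i] = v == v and abs(v) != float("inf")

    contig_vector[FLOAT32] = isfinite_f4
    output_typeid_vector[FLOAT32] = BOOL  # predicates always map to bool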
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
new file mode 100644
index 0000000000..fd7508792b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_isfinite(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
new file mode 100644
index 0000000000..2600fe4f74
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "isinf.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isinf.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U18: ==== ISINF (x)
+namespace impl
+{
+
+namespace isinf_fn_ns = dpctl::tensor::kernels::isinf;
+
+static unary_contig_impl_fn_ptr_t
+    isinf_contig_dispatch_vector[td_ns::num_types];
+static int isinf_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isinf_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isinf_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isinf_fn_ns;
+
+    using fn_ns::IsInfContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsInfContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector);
+
+    using fn_ns::IsInfStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsInfStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector);
+
+    using fn_ns::IsInfTypeMapFactory;
+    DispatchVectorBuilder<int, IsInfTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isinf_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isinf(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isinf_dispatch_vectors();
+        using impl::isinf_contig_dispatch_vector;
+        using impl::isinf_output_typeid_vector;
+        using impl::isinf_strided_dispatch_vector;
+
+        auto isinf_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, isinf_output_typeid_vector,
+                isinf_contig_dispatch_vector, isinf_strided_dispatch_vector);
+        };
+        m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isinf_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isinf_output_typeid_vector);
+        };
+        m.def("_isinf_result_type", isinf_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp
new file mode 100644
index 0000000000..8c3cd51c91
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_isinf(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp
new file mode 100644
index 0000000000..b75618c5e0
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "isnan.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isnan.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U19: ==== ISNAN (x)
+namespace impl
+{
+
+namespace isnan_fn_ns = dpctl::tensor::kernels::isnan;
+
+static unary_contig_impl_fn_ptr_t
+    isnan_contig_dispatch_vector[td_ns::num_types];
+static int isnan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isnan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isnan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isnan_fn_ns;
+
+    using fn_ns::IsNanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsNanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector);
+
+    using fn_ns::IsNanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsNanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector);
+
+    using fn_ns::IsNanTypeMapFactory;
+    DispatchVectorBuilder<int, IsNanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isnan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isnan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isnan_dispatch_vectors();
+        using impl::isnan_contig_dispatch_vector;
+        using impl::isnan_output_typeid_vector;
+        using impl::isnan_strided_dispatch_vector;
+
+        auto isnan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, isnan_output_typeid_vector,
+                isnan_contig_dispatch_vector, isnan_strided_dispatch_vector);
+        };
+        m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isnan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isnan_output_typeid_vector);
+        };
+        m.def("_isnan_result_type", isnan_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp
new file mode 100644
index 0000000000..df1f41d47f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_isnan(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp
new file mode 100644
index 0000000000..c34122d862
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "less.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/less.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B13: ===== LESS (x1, x2)
+namespace impl
+{
+namespace less_fn_ns = dpctl::tensor::kernels::less;
+
+static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types]
+                                                             [td_ns::num_types];
+static int less_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    less_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_less_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = less_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LessTypeMapFactory;
+    DispatchTableBuilder<int, LessTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(less_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LessStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(less_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LessContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(less_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_less(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_less_dispatch_tables();
+        using impl::less_contig_dispatch_table;
+        using impl::less_output_id_table;
+        using impl::less_strided_dispatch_table;
+
+        auto less_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                              const arrayT &dst, sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, less_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                less_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                less_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto less_result_type_pyapi = [&](const py::dtype &dtype1,
+                                          const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               less_output_id_table);
+        };
+        m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_less_result_type", less_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
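Comparisons allocate a boolean destination; the result-type query is expected to return bool for every supported input pair. A sketch under the same assumptions as the earlier snippets:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.arange(8, dtype="f4", sycl_queue=q)
    y = dpt.full(8, 4.0, dtype="f4", sycl_queue=q)
    mask = dpt.empty(8, dtype=ti._less_result_type(x.dtype, y.dtype),
                     sycl_queue=q)
    ht_ev, ev = ti._less(src1=x, src2=y, dst=mask, sycl_queue=q)
    ht_ev.wait()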
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp
new file mode 100644
index 0000000000..dada4b4be7
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_less(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
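[Editorial note, not part of the patch: each header exports exactly one init_* hook. The registration translation unit that consumes these hooks is not shown in this hunk, so the aggregator below is a hypothetical sketch; its file placement and function name are assumptions.]

// --- editorial sketch, not part of the patch ---
#include <pybind11/pybind11.h>

#include "less.hpp"
#include "less_equal.hpp"
#include "log.hpp"
// ...one include per elementwise function...

namespace py = pybind11;

void init_elementwise_functions(py::module_ m)
{
    using namespace dpctl::tensor::py_internal;
    init_less(m);       // registers _less and _less_result_type
    init_less_equal(m); // registers _less_equal and _less_equal_result_type
    init_log(m);        // registers _log and _log_result_type
    // ...
}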
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp
new file mode 100644
index 0000000000..712b30d902
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "less_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/less_equal.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B14: ===== LESS_EQUAL (x1, x2)
+namespace impl
+{
+namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal;
+
+static binary_contig_impl_fn_ptr_t
+    less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_less_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = less_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LessEqualTypeMapFactory;
+    DispatchTableBuilder<int, LessEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(less_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LessEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(less_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LessEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(less_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_less_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_less_equal_dispatch_tables();
+        using impl::less_equal_contig_dispatch_table;
+        using impl::less_equal_output_id_table;
+        using impl::less_equal_strided_dispatch_table;
+
+        auto less_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, less_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                less_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                less_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               less_equal_output_id_table);
+        };
+        m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_less_equal_result_type", less_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp
new file mode 100644
index 0000000000..e52ee3b940
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_less_equal(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp
new file mode 100644
index 0000000000..f73b9e2414
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U20: ==== LOG (x)
+namespace impl
+{
+
+namespace log_fn_ns = dpctl::tensor::kernels::log;
+
+static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types];
+static int log_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log_fn_ns;
+
+    using fn_ns::LogContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log_contig_dispatch_vector);
+
+    using fn_ns::LogStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log_strided_dispatch_vector);
+
+    using fn_ns::LogTypeMapFactory;
+    DispatchVectorBuilder<int, LogTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log_dispatch_vectors();
+        using impl::log_contig_dispatch_vector;
+        using impl::log_output_typeid_vector;
+        using impl::log_strided_dispatch_vector;
+
+        auto log_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log_output_typeid_vector,
+                log_contig_dispatch_vector, log_strided_dispatch_vector);
+        };
+        m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, log_output_typeid_vector);
+        };
+        m.def("_log_result_type", log_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
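[Editorial note, not part of the patch: unary functions are keyed by a single input type id, so the dispatch data populated above are 1-D vectors rather than the num_types x num_types tables used for binary functions. A minimal sketch of the consultation py_unary_ufunc implies; the helper name is invented for illustration.]

// --- editorial sketch, not part of the patch ---
int check_log_type_support(int src_typeid)
{
    // -1 marks an unsupported input type; any other value is the dpctl
    // type id the destination array must have.
    int dst_typeid = impl::log_output_typeid_vector[src_typeid];
    // When supported, py_unary_ufunc launches one of:
    //   impl::log_contig_dispatch_vector[src_typeid]   (contiguous data), or
    //   impl::log_strided_dispatch_vector[src_typeid]  (general case).
    return dst_typeid;
}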
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp
new file mode 100644
index 0000000000..1ca152d174
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp
new file mode 100644
index 0000000000..566dfcbcf7
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log10.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log10.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U23: ==== LOG10 (x)
+namespace impl
+{
+
+namespace log10_fn_ns = dpctl::tensor::kernels::log10;
+
+static unary_contig_impl_fn_ptr_t
+    log10_contig_dispatch_vector[td_ns::num_types];
+static int log10_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log10_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log10_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log10_fn_ns;
+
+    using fn_ns::Log10ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log10ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log10_contig_dispatch_vector);
+
+    using fn_ns::Log10StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log10StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log10_strided_dispatch_vector);
+
+    using fn_ns::Log10TypeMapFactory;
+    DispatchVectorBuilder<int, Log10TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log10_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log10(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log10_dispatch_vectors();
+        using impl::log10_contig_dispatch_vector;
+        using impl::log10_output_typeid_vector;
+        using impl::log10_strided_dispatch_vector;
+
+        auto log10_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log10_output_typeid_vector,
+                log10_contig_dispatch_vector, log10_strided_dispatch_vector);
+        };
+        m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log10_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              log10_output_typeid_vector);
+        };
+        m.def("_log10_result_type", log10_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp
new file mode 100644
index 0000000000..3972695849
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log10(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp
new file mode 100644
index 0000000000..badb474778
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log1p.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log1p.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U21: ==== LOG1P (x)
+namespace impl
+{
+
+namespace log1p_fn_ns = dpctl::tensor::kernels::log1p;
+
+static unary_contig_impl_fn_ptr_t
+    log1p_contig_dispatch_vector[td_ns::num_types];
+static int log1p_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log1p_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log1p_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log1p_fn_ns;
+
+    using fn_ns::Log1pContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log1pContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector);
+
+    using fn_ns::Log1pStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log1pStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector);
+
+    using fn_ns::Log1pTypeMapFactory;
+    DispatchVectorBuilder<int, Log1pTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log1p_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log1p(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log1p_dispatch_vectors();
+        using impl::log1p_contig_dispatch_vector;
+        using impl::log1p_output_typeid_vector;
+        using impl::log1p_strided_dispatch_vector;
+
+        auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log1p_output_typeid_vector,
+                log1p_contig_dispatch_vector, log1p_strided_dispatch_vector);
+        };
+        m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log1p_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              log1p_output_typeid_vector);
+        };
+        m.def("_log1p_result_type", log1p_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
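[Editorial note, not part of the patch: a dedicated log1p kernel exists because log(1 + x) loses all of the information carried by x once 1 + x rounds to 1, while log1p(x) stays accurate for |x| much smaller than 1. A host-side demonstration of the effect:]

// --- editorial sketch, not part of the patch ---
#include <cmath>
#include <cstdio>

int main(void)
{
    double x = 1e-18;
    // 1.0 + 1e-18 rounds to 1.0 in double precision, so log gives 0.
    std::printf("log(1+x) = %.17g\n", std::log(1.0 + x)); // prints 0
    // log1p avoids forming 1 + x and keeps full precision (~1e-18).
    std::printf("log1p(x) = %.17g\n", std::log1p(x));
    return 0;
}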
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp
new file mode 100644
index 0000000000..438b93601c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log1p(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp
new file mode 100644
index 0000000000..b5a8a39684
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "log2.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/log2.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U22: ==== LOG2 (x)
+namespace impl
+{
+
+namespace log2_fn_ns = dpctl::tensor::kernels::log2;
+
+static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types];
+static int log2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    log2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_log2_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = log2_fn_ns;
+
+    using fn_ns::Log2ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log2ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(log2_contig_dispatch_vector);
+
+    using fn_ns::Log2StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log2StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(log2_strided_dispatch_vector);
+
+    using fn_ns::Log2TypeMapFactory;
+    DispatchVectorBuilder<int, Log2TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(log2_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_log2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_log2_dispatch_vectors();
+        using impl::log2_contig_dispatch_vector;
+        using impl::log2_output_typeid_vector;
+        using impl::log2_strided_dispatch_vector;
+
+        auto log2_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, log2_output_typeid_vector,
+                log2_contig_dispatch_vector, log2_strided_dispatch_vector);
+        };
+        m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto log2_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector);
+        };
+        m.def("_log2_result_type", log2_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp
new file mode 100644
index 0000000000..4e47ed369a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_log2(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
new file mode 100644
index 0000000000..77ded230be
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logaddexp.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logaddexp.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B15: ===== LOGADDEXP (x1, x2)
+namespace impl
+{
+namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp;
+
+static binary_contig_impl_fn_ptr_t
+    logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logaddexp_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logaddexp_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogAddExpTypeMapFactory;
+    DispatchTableBuilder<int, LogAddExpTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logaddexp_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogAddExpStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogAddExpStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogAddExpContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogAddExpContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logaddexp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logaddexp_dispatch_tables();
+        using impl::logaddexp_contig_dispatch_table;
+        using impl::logaddexp_output_id_table;
+        using impl::logaddexp_strided_dispatch_table;
+
+        auto logaddexp_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logaddexp_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logaddexp_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logaddexp_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logaddexp_output_id_table);
+        };
+        m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
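[Editorial note, not part of the patch: the kernel header kernels/elementwise_functions/logaddexp.hpp is not shown in this diff. The standard numerically stable formulation of logaddexp(x1, x2) = log(exp(x1) + exp(x2)), given here as a host-side reference, factors out the maximum so the exponential never overflows:]

// --- editorial sketch, not part of the patch ---
#include <algorithm>
#include <cmath>

double logaddexp_ref(double x1, double x2)
{
    // Naive log(exp(x1) + exp(x2)) overflows for moderate inputs, e.g.
    // exp(710) is already inf in double precision.
    if (x1 == x2) {
        // Also covers x1 == x2 == -inf, where the branch below would
        // otherwise compute -inf - (-inf) = NaN.
        return x1 + std::log(2.0);
    }
    const double mx = std::max(x1, x2);
    const double mn = std::min(x1, x2);
    // mn - mx <= 0, so exp() stays in [0, 1] and log1p keeps precision.
    return mx + std::log1p(std::exp(mn - mx));
}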
m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp new file mode 100644 index 0000000000..6601b3f9c5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logaddexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp new file mode 100644 index 0000000000..4c573ce508 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
new file mode 100644
index 0000000000..4c573ce508
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_and.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_and.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B16: ===== LOGICAL_AND (x1, x2)
+namespace impl
+{
+namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and;
+
+static binary_contig_impl_fn_ptr_t
+    logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalAndTypeMapFactory;
+    DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_and_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_and_dispatch_tables();
+        using impl::logical_and_contig_dispatch_table;
+        using impl::logical_and_output_id_table;
+        using impl::logical_and_strided_dispatch_table;
+
+        auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_and_output_id_table);
+        };
+        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
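[Editorial note, not part of the patch: every comparison and logical binary function in this series passes td_ns::NullPtrTable arguments for the two matrix-row broadcast slots, i.e. it opts out of those specializations and lets the general strided path handle broadcasting. The actual NullPtrTable definition lives in utils/type_dispatch.hpp and is not shown in the diff; the following is only a sketch of the idea behind such a helper.]

// --- editorial sketch, not part of the patch ---
// A "table of nullptrs": zero-initialization makes every (i, j) entry a null
// function pointer, which the dispatching code interprets as "no specialized
// kernel available for this type pair -- fall back to the strided kernel".
template <typename FnPtrT, int N> struct null_ptr_table_sketch
{
    FnPtrT table[N][N] = {};
};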
py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logical_and_result_type", logical_and_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp new file mode 100644 index 0000000000..ee73f7c8d5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logical_and(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp new file mode 100644 index 0000000000..84362cd9ce --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp @@ -0,0 +1,123 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
new file mode 100644
index 0000000000..84362cd9ce
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
@@ -0,0 +1,123 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_not.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_not.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U24: ==== LOGICAL_NOT (x)
+namespace impl
+{
+
+namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
+
+static unary_contig_impl_fn_ptr_t
+    logical_not_contig_dispatch_vector[td_ns::num_types];
+static int logical_not_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    logical_not_strided_dispatch_vector[td_ns::num_types];
+
+void populate_logical_not_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_not_fn_ns;
+
+    using fn_ns::LogicalNotContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
+
+    using fn_ns::LogicalNotStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogicalNotStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
+
+    using fn_ns::LogicalNotTypeMapFactory;
+    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_logical_not(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_not_dispatch_vectors();
+        using impl::logical_not_contig_dispatch_vector;
+        using impl::logical_not_output_typeid_vector;
+        using impl::logical_not_strided_dispatch_vector;
+
+        auto logical_not_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  logical_not_output_typeid_vector,
+                                  logical_not_contig_dispatch_vector,
+                                  logical_not_strided_dispatch_vector);
+        };
+        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              logical_not_output_typeid_vector);
+        };
+        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
new file mode 100644
index 0000000000..c1a2c393aa
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_logical_not(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
new file mode 100644
index 0000000000..ebf8251b2e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_or.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_or.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B17: ===== LOGICAL_OR (x1, x2)
+namespace impl
+{
+namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
+
+static binary_contig_impl_fn_ptr_t
+    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalOrTypeMapFactory;
+    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_or_dispatch_tables();
+        using impl::logical_or_contig_dispatch_table;
+        using impl::logical_or_output_id_table;
+        using impl::logical_or_strided_dispatch_table;
+
+        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_or_output_id_table);
+        };
+        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
py::arg("depends") = py::list()); + m.def("_logical_or_result_type", logical_or_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp new file mode 100644 index 0000000000..00a4ddfcc2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logical_xor(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp new file mode 100644 index 0000000000..9488a5615a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
new file mode 100644
index 0000000000..9488a5615a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_xor.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_xor.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B18: ===== LOGICAL_XOR (x1, x2)
+namespace impl
+{
+namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
+
+static binary_contig_impl_fn_ptr_t
+    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalXorTypeMapFactory;
+    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_xor_dispatch_tables();
+        using impl::logical_xor_contig_dispatch_table;
+        using impl::logical_xor_output_id_table;
+        using impl::logical_xor_strided_dispatch_table;
+
+        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_xor_output_id_table);
+        };
+        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
new file mode 100644
index 0000000000..ad069eb120
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_logical_xor(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
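A fuller sketch of the binary-ufunc calling convention that `_logical_xor` (and every other binary binding in these hunks) follows: the caller allocates `dst` using the result-type helper and passes the queue explicitly. This assumes a SYCL device is available; the returned pair being (host-task event, computation event) is an assumption based on `py_binary_ufunc`'s usual convention, not something these hunks show:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
x1 = dpt.asarray([True, False, True, False], sycl_queue=q)
x2 = dpt.asarray([True, True, False, False], sycl_queue=q)

# result dtype comes from the output_id table populated above
res_dt = ti._logical_xor_result_type(x1.dtype, x2.dtype)
dst = dpt.empty(x1.shape, dtype=res_dt, sycl_queue=q)

ht_ev, _ = ti._logical_xor(src1=x1, src2=x2, dst=dst, sycl_queue=q, depends=[])
ht_ev.wait()
print(dpt.asnumpy(dst))  # expected: [False  True  True  False]
```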
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
new file mode 100644
index 0000000000..208bdcf47f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "maximum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/maximum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B26: ===== MAXIMUM (x1, x2)
+namespace impl
+{
+namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
+
+static binary_contig_impl_fn_ptr_t
+    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_maximum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = maximum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MaximumTypeMapFactory;
+    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(maximum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MaximumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MaximumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_maximum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_maximum_dispatch_tables();
+        using impl::maximum_contig_dispatch_table;
+        using impl::maximum_output_id_table;
+        using impl::maximum_strided_dispatch_table;
+
+        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, maximum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                maximum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                maximum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               maximum_output_id_table);
+        };
+        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp new file mode 100644 index 0000000000..0f49850567 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_maximum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp new file mode 100644 index 0000000000..dc1a826ac4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
new file mode 100644
index 0000000000..dc1a826ac4
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "minimum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/minimum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B27: ===== MINIMUM (x1, x2)
+namespace impl
+{
+namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
+
+static binary_contig_impl_fn_ptr_t
+    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_minimum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = minimum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MinimumTypeMapFactory;
+    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(minimum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MinimumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MinimumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_minimum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_minimum_dispatch_tables();
+        using impl::minimum_contig_dispatch_table;
+        using impl::minimum_output_id_table;
+        using impl::minimum_strided_dispatch_table;
+
+        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, minimum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                minimum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                minimum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               minimum_output_id_table);
+        };
+        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp new file mode 100644 index 0000000000..f1f2467c1e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_minimum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp new file mode 100644 index 0000000000..c087abd9ff --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp @@ -0,0 +1,230 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
new file mode 100644
index 0000000000..c087abd9ff
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
@@ -0,0 +1,230 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "multiply.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/multiply.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B19: ===== MULTIPLY (x1, x2)
+namespace impl
+{
+
+namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
+
+static binary_contig_impl_fn_ptr_t
+    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// mul(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    multiply_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// mul(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    multiply_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_multiply_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = multiply_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MultiplyTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(multiply_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MultiplyStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MultiplyContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::MultiplyInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         MultiplyInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::MultiplyInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         MultiplyInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
+};
+
+} // namespace impl
+
+void init_multiply(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_multiply_dispatch_tables();
+        using impl::multiply_contig_dispatch_table;
+        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::multiply_output_id_table;
+        using impl::multiply_strided_dispatch_table;
+
+        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                multiply_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                multiply_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               multiply_output_id_table);
+        };
+        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
+
+        using impl::multiply_inplace_contig_dispatch_table;
+        using impl::multiply_inplace_row_matrix_dispatch_table;
+        using impl::multiply_inplace_strided_dispatch_table;
+
+        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                multiply_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                multiply_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                multiply_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
new file mode 100644
index 0000000000..e110ecbb20
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_multiply(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
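Unlike the out-of-place bindings, `_multiply_inplace` writes into its `lhs` argument, so the caller allocates nothing. A minimal sketch under the same assumptions as the earlier examples (here `lhs` and `rhs` share a dtype, so no promotion question arises):

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
lhs = dpt.asarray([1.0, 2.0, 3.0], sycl_queue=q)
rhs = dpt.asarray([10.0, 10.0, 10.0], sycl_queue=q)

# lhs is both an input and the destination
ht, _ = ti._multiply_inplace(lhs=lhs, rhs=rhs, sycl_queue=q)
ht.wait()
print(dpt.asnumpy(lhs))  # expected: [10. 20. 30.]
```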
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
new file mode 100644
index 0000000000..bc659506d1
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "negative.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/negative.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U25: ==== NEGATIVE (x)
+namespace impl
+{
+
+namespace negative_fn_ns = dpctl::tensor::kernels::negative;
+
+static unary_contig_impl_fn_ptr_t
+    negative_contig_dispatch_vector[td_ns::num_types];
+static int negative_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    negative_strided_dispatch_vector[td_ns::num_types];
+
+void populate_negative_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = negative_fn_ns;
+
+    using fn_ns::NegativeContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
+
+    using fn_ns::NegativeStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
+
+    using fn_ns::NegativeTypeMapFactory;
+    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_negative(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_negative_dispatch_vectors();
+        using impl::negative_contig_dispatch_vector;
+        using impl::negative_output_typeid_vector;
+        using impl::negative_strided_dispatch_vector;
+
+        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  negative_output_typeid_vector,
+                                  negative_contig_dispatch_vector,
+                                  negative_strided_dispatch_vector);
+        };
+        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              negative_output_typeid_vector);
+        };
+        m.def("_negative_result_type", negative_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
new file mode 100644
index 0000000000..048e481b34
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_negative(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
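The unary bindings use the same three-vector dispatch (type map, contig, strided) seen in `populate_negative_dispatch_vectors` above, but take a single `src`. A minimal sketch under the same assumptions:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
x = dpt.asarray([1, -2, 3], dtype="i4", sycl_queue=q)
dst = dpt.empty(x.shape, dtype=ti._negative_result_type(x.dtype), sycl_queue=q)

ht, _ = ti._negative(src=x, dst=dst, sycl_queue=q)
ht.wait()
print(dpt.asnumpy(dst))  # expected: [-1  2 -3]
```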
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
new file mode 100644
index 0000000000..a7a3e909cb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "not_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/not_equal.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B20: ===== NOT_EQUAL (x1, x2)
+namespace impl
+{
+namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal;
+
+static binary_contig_impl_fn_ptr_t
+    not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_not_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = not_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::NotEqualTypeMapFactory;
+    DispatchTableBuilder<int, NotEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(not_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::NotEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NotEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(not_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::NotEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NotEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(not_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_not_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_not_equal_dispatch_tables();
+        using impl::not_equal_contig_dispatch_table;
+        using impl::not_equal_output_id_table;
+        using impl::not_equal_strided_dispatch_table;
+
+        auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                not_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                not_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               not_equal_output_id_table);
+        };
+        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
new file mode 100644
index 0000000000..4e1f654e79
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_not_equal(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
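The `depends` parameter every binding accepts lets one call be ordered after another without a host synchronization. A sketch chaining `_negative` into `_not_equal`; that the second element of the returned tuple is the computation event suitable for `depends` is an assumption about `py_binary_ufunc`/`py_unary_ufunc`, not something these hunks state:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
a = dpt.asarray([0, 1, 2], dtype="i4", sycl_queue=q)
b = dpt.asarray([0, -1, 2], dtype="i4", sycl_queue=q)
neg_b = dpt.empty(b.shape, dtype=b.dtype, sycl_queue=q)

ht1, ev1 = ti._negative(src=b, dst=neg_b, sycl_queue=q)

flags = dpt.empty(a.shape, dtype=ti._not_equal_result_type(a.dtype, neg_b.dtype),
                  sycl_queue=q)
# schedule the comparison after the negation that produces neg_b
ht2, _ = ti._not_equal(src1=a, src2=neg_b, dst=flags, sycl_queue=q, depends=[ev1])
ht2.wait()
ht1.wait()
print(dpt.asnumpy(flags))  # expected: [False  False  False] -> a == -b elementwise
```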
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
new file mode 100644
index 0000000000..eaff0794d2
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "positive.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/positive.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U26: ==== POSITIVE (x)
+namespace impl
+{
+
+namespace positive_fn_ns = dpctl::tensor::kernels::positive;
+
+static unary_contig_impl_fn_ptr_t
+    positive_contig_dispatch_vector[td_ns::num_types];
+static int positive_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    positive_strided_dispatch_vector[td_ns::num_types];
+
+void populate_positive_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = positive_fn_ns;
+
+    using fn_ns::PositiveContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, PositiveContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(positive_contig_dispatch_vector);
+
+    using fn_ns::PositiveStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, PositiveStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(positive_strided_dispatch_vector);
+
+    using fn_ns::PositiveTypeMapFactory;
+    DispatchVectorBuilder<int, PositiveTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(positive_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_positive(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_positive_dispatch_vectors();
+        using impl::positive_contig_dispatch_vector;
+        using impl::positive_output_typeid_vector;
+        using impl::positive_strided_dispatch_vector;
+
+        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  positive_output_typeid_vector,
+                                  positive_contig_dispatch_vector,
+                                  positive_strided_dispatch_vector);
+        };
+        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              positive_output_typeid_vector);
+        };
+        m.def("_positive_result_type", positive_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
new file mode 100644
index 0000000000..a7b19a07ab
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_positive(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
new file mode 100644
index 0000000000..a8ef6cb171
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
@@ -0,0 +1,189 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "pow.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/pow.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B21: ===== POW (x1, x2)
+namespace impl
+{
+
+namespace pow_fn_ns = dpctl::tensor::kernels::pow;
+
+static binary_contig_impl_fn_ptr_t
+    pow_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int pow_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_pow_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = pow_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::PowTypeMapFactory;
+    DispatchTableBuilder<int, PowTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(pow_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::PowStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, PowStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(pow_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::PowContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, PowContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(pow_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::PowInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         PowInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::PowInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         PowInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_pow(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_pow_dispatch_tables();
+        using impl::pow_contig_dispatch_table;
+        using impl::pow_output_id_table;
+        using impl::pow_strided_dispatch_table;
+
+        auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                pow_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                pow_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto pow_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               pow_output_id_table);
+        };
+        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_pow_result_type", pow_result_type_pyapi, "");
+
+        using impl::pow_inplace_contig_dispatch_table;
+        using impl::pow_inplace_strided_dispatch_table;
+
+        auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                pow_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                pow_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
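`_pow` shares the calling convention of the other binary bindings; a small integer-only sketch avoids any promotion subtleties (the result dtype still comes from `pow_output_id_table` via `_pow_result_type`):

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
base = dpt.asarray([2, 3, 4], dtype="i4", sycl_queue=q)
exp = dpt.asarray([3, 2, 1], dtype="i4", sycl_queue=q)

res_dt = ti._pow_result_type(base.dtype, exp.dtype)
dst = dpt.empty(base.shape, dtype=res_dt, sycl_queue=q)
ht, _ = ti._pow(src1=base, src2=exp, dst=dst, sycl_queue=q)
ht.wait()
print(dpt.asnumpy(dst))  # expected: [8 9 4]
```

Note that, unlike multiply, the in-place pow binding passes a `NullPtrTable` for the row/matrix broadcast slot, so that specialization simply falls back to the strided path.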
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
new file mode 100644
index 0000000000..7a13b414eb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_pow(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
new file mode 100644
index 0000000000..60060084e1
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "proj.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/proj.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U40: ==== PROJ (x)
+namespace impl
+{
+
+namespace proj_fn_ns = dpctl::tensor::kernels::proj;
+
+static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types];
+static int proj_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    proj_strided_dispatch_vector[td_ns::num_types];
+
+void populate_proj_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = proj_fn_ns;
+
+    using fn_ns::ProjContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ProjContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(proj_contig_dispatch_vector);
+
+    using fn_ns::ProjStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ProjStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(proj_strided_dispatch_vector);
+
+    using fn_ns::ProjTypeMapFactory;
+    DispatchVectorBuilder<int, ProjTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(proj_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_proj(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_proj_dispatch_vectors();
+        using impl::proj_contig_dispatch_vector;
+        using impl::proj_output_typeid_vector;
+        using impl::proj_strided_dispatch_vector;
+
+        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, proj_output_typeid_vector,
+                proj_contig_dispatch_vector, proj_strided_dispatch_vector);
+        };
+        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              proj_output_typeid_vector);
+        };
+        m.def("_proj_result_type", proj_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
new file mode 100644
index 0000000000..efbe751455
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_proj(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
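`_proj` computes the complex projection (in the spirit of `std::proj`: finite values pass through, values with an infinite component map to complex infinity). A sketch under the earlier assumptions, plus the additional assumption that the device supports the complex dtype chosen here:

```python
import dpctl
import dpctl.tensor as dpt
from dpctl.tensor import _tensor_impl as ti

q = dpctl.SyclQueue()
x = dpt.asarray([complex(1.0, 2.0), complex(float("inf"), -3.0)], sycl_queue=q)
dst = dpt.empty(x.shape, dtype=ti._proj_result_type(x.dtype), sycl_queue=q)

ht, _ = ti._proj(src=x, dst=dst, sycl_queue=q)
ht.wait()
# (1+2j) passes through unchanged; (inf-3j) is expected to project to inf-0j
print(dpt.asnumpy(dst))
```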
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "real.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/real.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U27: ==== REAL (x) +namespace impl +{ + +namespace real_fn_ns = dpctl::tensor::kernels::real; + +static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types]; +static int real_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + real_strided_dispatch_vector[td_ns::num_types]; + +void populate_real_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = real_fn_ns; + + using fn_ns::RealContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(real_contig_dispatch_vector); + + using fn_ns::RealStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(real_strided_dispatch_vector); + + using fn_ns::RealTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(real_output_typeid_vector); +}; + +} // namespace impl + +void init_real(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_real_dispatch_vectors(); + using impl::real_contig_dispatch_vector; + using impl::real_output_typeid_vector; + using impl::real_strided_dispatch_vector; + + auto real_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, real_output_typeid_vector, + real_contig_dispatch_vector, real_strided_dispatch_vector); + }; + m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto real_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, real_output_typeid_vector); + }; + m.def("_real_result_type", real_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp new file mode 100644 index 0000000000..b380632448 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_real(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp new file mode 100644 index 0000000000..3255ea7e7f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
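+///
+/// Binary functions such as remainder dispatch on both operand types, so
+/// the flat vectors become num_types x num_types tables, and a companion
+/// integer table records the result type id for every operand pair, with
+/// a negative entry meaning the pair is unsupported. A toy sketch of the
+/// kind of mapping a TypeMapFactory encodes (the type ids and promotion
+/// rule below are illustrative only):
+///
+/// \code{.cpp}
+/// enum { tid_int32 = 0, tid_float32 = 1, tid_float64 = 2, n_tids = 3 };
+///
+/// static int output_id_table[n_tids][n_tids];
+///
+/// static void populate_type_map()
+/// {
+///     for (int i = 0; i < n_tids; ++i) {
+///         for (int j = 0; j < n_tids; ++j) {
+///             // promote to the wider operand type; a real factory
+///             // would consult the operation's type-support matrix
+///             output_id_table[i][j] = (i > j) ? i : j;
+///         }
+///     }
+/// }
+/// \endcode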
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "remainder.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/remainder.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B22: ===== REMAINDER (x1, x2) +namespace impl +{ + +namespace remainder_fn_ns = dpctl::tensor::kernels::remainder; + +static binary_contig_impl_fn_ptr_t + remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int remainder_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + remainder_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_remainder_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = remainder_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::RemainderTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(remainder_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::RemainderStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(remainder_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::RemainderContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(remainder_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::RemainderInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::RemainderInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table); +} + +} // namespace impl + +void init_remainder(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_remainder_dispatch_tables(); + using impl::remainder_contig_dispatch_table; + using impl::remainder_output_id_table; + using impl::remainder_strided_dispatch_table; + + auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, remainder_output_id_table, + // function pointers to handle operation on 
contiguous arrays + // (pointers may be nullptr) + remainder_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + remainder_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto remainder_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + remainder_output_id_table); + }; + m.def("_remainder", remainder_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_remainder_result_type", remainder_result_type_pyapi, ""); + + using impl::remainder_inplace_contig_dispatch_table; + using impl::remainder_inplace_strided_dispatch_table; + + auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, remainder_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + remainder_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + remainder_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_remainder_inplace", remainder_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp new file mode 100644 index 0000000000..ef538547a8 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
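+///
+/// Each of these headers exposes a single init_* hook; a central
+/// registration translation unit is then expected to call every hook on
+/// the pybind11 module object. A minimal sketch of that wiring, assuming
+/// a hypothetical module name _example_impl (the actual registration
+/// site is not part of this header):
+///
+/// \code{.cpp}
+/// #include <pybind11/pybind11.h>
+///
+/// namespace py = pybind11;
+///
+/// namespace dpctl { namespace tensor { namespace py_internal {
+/// void init_remainder(py::module_ m);
+/// void init_round(py::module_ m);
+/// }}} // namespace dpctl::tensor::py_internal
+///
+/// PYBIND11_MODULE(_example_impl, m)
+/// {
+///     using namespace dpctl::tensor::py_internal;
+///     init_remainder(m); // registers _remainder, _remainder_inplace, ...
+///     init_round(m);     // registers _round, _round_result_type
+/// }
+/// \endcode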
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_remainder(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp new file mode 100644 index 0000000000..cce730b899 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "round.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/round.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U28: ==== ROUND (x) +namespace impl +{ + +namespace round_fn_ns = dpctl::tensor::kernels::round; + +static unary_contig_impl_fn_ptr_t + round_contig_dispatch_vector[td_ns::num_types]; +static int round_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + round_strided_dispatch_vector[td_ns::num_types]; + +void populate_round_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = round_fn_ns; + + using fn_ns::RoundContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(round_contig_dispatch_vector); + + using fn_ns::RoundStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(round_strided_dispatch_vector); + + using fn_ns::RoundTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(round_output_typeid_vector); +}; + +} // namespace impl + +void init_round(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_round_dispatch_vectors(); + using impl::round_contig_dispatch_vector; + using impl::round_output_typeid_vector; + using impl::round_strided_dispatch_vector; + + auto round_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return 
py_unary_ufunc( + src, dst, exec_q, depends, round_output_typeid_vector, + round_contig_dispatch_vector, round_strided_dispatch_vector); + }; + m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto round_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + round_output_typeid_vector); + }; + m.def("_round_result_type", round_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp new file mode 100644 index 0000000000..5753ef233b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_round(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp new file mode 100644 index 0000000000..4661fdfa48 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
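+///
+/// At call time, py_unary_ufunc looks up the source array's type id in
+/// the populated tables and prefers the contiguous specialization when
+/// both arrays are contiguous, falling back to the general strided
+/// kernel otherwise. Roughly (the function-pointer signatures below are
+/// simplified placeholders, not dpctl's actual typedefs):
+///
+/// \code{.cpp}
+/// #include <cstddef>
+///
+/// constexpr int num_types = 14;
+/// using contig_fn_t = void (*)(const char *, char *, std::size_t);
+/// using strided_fn_t = void (*)(const char *, char *, std::size_t, int);
+///
+/// // returns 0: contig path, 1: strided path, -1: unsupported dtype
+/// int select_impl(int src_typeid, bool both_contig,
+///                 contig_fn_t (&contig)[num_types],
+///                 strided_fn_t (&strided)[num_types])
+/// {
+///     if (src_typeid < 0 || src_typeid >= num_types)
+///         return -1;
+///     if (both_contig && contig[src_typeid] != nullptr)
+///         return 0;
+///     return (strided[src_typeid] != nullptr) ? 1 : -1;
+/// }
+/// \endcode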
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "rsqrt.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/rsqrt.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U39: ==== RSQRT (x) +namespace impl +{ + +namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt; + +static unary_contig_impl_fn_ptr_t + rsqrt_contig_dispatch_vector[td_ns::num_types]; +static int rsqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + rsqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_rsqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = rsqrt_fn_ns; + + using fn_ns::RsqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector); + + using fn_ns::RsqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector); + + using fn_ns::RsqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_rsqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_rsqrt_dispatch_vectors(); + using impl::rsqrt_contig_dispatch_vector; + using impl::rsqrt_output_typeid_vector; + using impl::rsqrt_strided_dispatch_vector; + + auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, rsqrt_output_typeid_vector, + rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector); + }; + m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + rsqrt_output_typeid_vector); + }; + m.def("_rsqrt_result_type", rsqrt_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp new file mode 100644 index 0000000000..50efc16d79 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_rsqrt(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp new file mode 100644 index 0000000000..7b7c2c22e5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
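+///
+/// The _*_result_type helpers exposed alongside each function answer the
+/// dtype-support question without launching anything: the input dtype is
+/// mapped to a type id and looked up in the populated typeid vector. In
+/// essence (a simplified model of the lookup, not the actual helper):
+///
+/// \code{.cpp}
+/// constexpr int num_types = 14;
+///
+/// // a negative result means the input dtype is not supported
+/// int result_typeid(int src_typeid, const int (&typeid_vec)[num_types])
+/// {
+///     if (src_typeid < 0 || src_typeid >= num_types)
+///         return -1;
+///     return typeid_vec[src_typeid];
+/// }
+/// \endcode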
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sign.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sign.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U29: ==== SIGN (x) +namespace impl +{ + +namespace sign_fn_ns = dpctl::tensor::kernels::sign; + +static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types]; +static int sign_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sign_strided_dispatch_vector[td_ns::num_types]; + +void populate_sign_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sign_fn_ns; + + using fn_ns::SignContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sign_contig_dispatch_vector); + + using fn_ns::SignStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sign_strided_dispatch_vector); + + using fn_ns::SignTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sign_output_typeid_vector); +}; + +} // namespace impl + +void init_sign(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sign_dispatch_vectors(); + using impl::sign_contig_dispatch_vector; + using impl::sign_output_typeid_vector; + using impl::sign_strided_dispatch_vector; + + auto sign_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sign_output_typeid_vector, + sign_contig_dispatch_vector, sign_strided_dispatch_vector); + }; + m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sign_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector); + }; + m.def("_sign_result_type", sign_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp new file mode 100644 index 0000000000..fa01370842 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sign(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp new file mode 100644 index 0000000000..fc101dd64b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp @@ -0,0 +1,122 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
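+///
+/// Predicates like signbit differ from the other unary functions only in
+/// their type map: real floating-point inputs produce a boolean result
+/// and every other input type is rejected. A sketch of the output-type
+/// trait such a TypeMapFactory evaluates (SignbitOutputType is an
+/// illustrative name, not the kernel header's actual trait):
+///
+/// \code{.cpp}
+/// #include <type_traits>
+///
+/// template <typename T> struct SignbitOutputType
+/// {
+///     // void plays the role of "unsupported"
+///     using value_type =
+///         std::conditional_t<std::is_floating_point_v<T>, bool, void>;
+/// };
+///
+/// static_assert(
+///     std::is_same_v<SignbitOutputType<double>::value_type, bool>, "");
+/// static_assert(
+///     std::is_same_v<SignbitOutputType<int>::value_type, void>, "");
+/// \endcode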
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "signbit.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/signbit.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U41: ==== SIGNBIT (x) +namespace impl +{ + +namespace signbit_fn_ns = dpctl::tensor::kernels::signbit; + +static unary_contig_impl_fn_ptr_t + signbit_contig_dispatch_vector[td_ns::num_types]; +static int signbit_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + signbit_strided_dispatch_vector[td_ns::num_types]; + +void populate_signbit_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = signbit_fn_ns; + + using fn_ns::SignbitContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector); + + using fn_ns::SignbitStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector); + + using fn_ns::SignbitTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(signbit_output_typeid_vector); +}; + +} // namespace impl + +void init_signbit(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_signbit_dispatch_vectors(); + using impl::signbit_contig_dispatch_vector; + using impl::signbit_output_typeid_vector; + using impl::signbit_strided_dispatch_vector; + + auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + signbit_output_typeid_vector, + signbit_contig_dispatch_vector, + signbit_strided_dispatch_vector); + }; + m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto signbit_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + signbit_output_typeid_vector); + }; + m.def("_signbit_result_type", signbit_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp new file mode 100644 index 0000000000..85054bb4de --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_signbit(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp new file mode 100644 index 0000000000..415dc15133 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
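+///
+/// Every binding in this directory takes an optional depends keyword
+/// defaulting to an empty py::list; the pybind11 stl.h caster converts
+/// it into the std::vector of sycl::event the lambda receives, which
+/// lets Python callers chain asynchronous submissions. A trimmed model
+/// of that default-argument mechanics (int stands in for sycl::event;
+/// bind_example is a hypothetical helper):
+///
+/// \code{.cpp}
+/// #include <pybind11/pybind11.h>
+/// #include <pybind11/stl.h>
+/// #include <vector>
+///
+/// namespace py = pybind11;
+///
+/// std::size_t count_deps(const std::vector<int> &depends)
+/// {
+///     return depends.size();
+/// }
+///
+/// // would be called with the module object during module init;
+/// // an omitted depends argument converts to an empty vector
+/// void bind_example(py::module_ &m)
+/// {
+///     m.def("_count_deps", &count_deps, py::arg("depends") = py::list());
+/// }
+/// \endcode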
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sin.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sin.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U30: ==== SIN (x) +namespace impl +{ + +namespace sin_fn_ns = dpctl::tensor::kernels::sin; + +static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types]; +static int sin_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sin_strided_dispatch_vector[td_ns::num_types]; + +void populate_sin_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sin_fn_ns; + + using fn_ns::SinContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sin_contig_dispatch_vector); + + using fn_ns::SinStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sin_strided_dispatch_vector); + + using fn_ns::SinTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sin_output_typeid_vector); +}; + +} // namespace impl + +void init_sin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sin_dispatch_vectors(); + using impl::sin_contig_dispatch_vector; + using impl::sin_output_typeid_vector; + using impl::sin_strided_dispatch_vector; + + auto sin_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sin_output_typeid_vector, + sin_contig_dispatch_vector, sin_strided_dispatch_vector); + }; + m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sin_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector); + }; + m.def("_sin_result_type", sin_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp new file mode 100644 index 0000000000..bd03604b16 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sin(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp new file mode 100644 index 0000000000..d9f92eb8f1 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sinh.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sinh.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U31: ==== SINH (x) +namespace impl +{ + +namespace sinh_fn_ns = dpctl::tensor::kernels::sinh; + +static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types]; +static int sinh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sinh_strided_dispatch_vector[td_ns::num_types]; + +void populate_sinh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sinh_fn_ns; + + using fn_ns::SinhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector); + + using fn_ns::SinhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector); + + using fn_ns::SinhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sinh_output_typeid_vector); +}; + +} // namespace impl + +void init_sinh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sinh_dispatch_vectors(); + using impl::sinh_contig_dispatch_vector; + using impl::sinh_output_typeid_vector; + using impl::sinh_strided_dispatch_vector; + + auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sinh_output_typeid_vector, + sinh_contig_dispatch_vector, sinh_strided_dispatch_vector); + }; + m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sinh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector); + }; + m.def("_sinh_result_type", sinh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp new file mode 100644 index 0000000000..fef8ec416a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sinh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp new file mode 100644 index 0000000000..159d45b51c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sqrt.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sqrt.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U33: ==== SQRT (x) +namespace impl +{ + +namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt; + +static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types]; +static int sqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_sqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sqrt_fn_ns; + + using fn_ns::SqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector); + + using fn_ns::SqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector); + + using fn_ns::SqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_sqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sqrt_dispatch_vectors(); + using impl::sqrt_contig_dispatch_vector; + using impl::sqrt_output_typeid_vector; + using impl::sqrt_strided_dispatch_vector; + + auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sqrt_output_typeid_vector, + sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector); + }; + m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector); + }; + m.def("_sqrt_result_type", sqrt_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp new file mode 100644 index 0000000000..38ea68635b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sqrt(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp new file mode 100644 index 0000000000..184e09c19c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "square.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/square.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U32: ==== SQUARE (x) +namespace impl +{ + +namespace square_fn_ns = dpctl::tensor::kernels::square; + +static unary_contig_impl_fn_ptr_t + square_contig_dispatch_vector[td_ns::num_types]; +static int square_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + square_strided_dispatch_vector[td_ns::num_types]; + +void populate_square_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = square_fn_ns; + + using fn_ns::SquareContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(square_contig_dispatch_vector); + + using fn_ns::SquareStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(square_strided_dispatch_vector); + + using fn_ns::SquareTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(square_output_typeid_vector); +}; + +} // namespace impl + +void init_square(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_square_dispatch_vectors(); + using impl::square_contig_dispatch_vector; + using impl::square_output_typeid_vector; + using impl::square_strided_dispatch_vector; + + auto square_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, square_output_typeid_vector, + square_contig_dispatch_vector, square_strided_dispatch_vector); + }; + m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto square_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + square_output_typeid_vector); + }; + m.def("_square_result_type", square_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp new file mode 100644 index 0000000000..d8268b728a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_square(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp new file mode 100644 index 0000000000..9703182e7a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp @@ -0,0 +1,229 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
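+///
+/// Unlike remainder above, subtract also fills the two broadcast tables,
+/// so a C-contiguous matrix combined with a C-contiguous row takes a
+/// fast path instead of the general strided machinery. The computation
+/// that fast path performs is simply the following (an illustrative
+/// serial loop, not the SYCL kernel itself):
+///
+/// \code{.cpp}
+/// #include <cstddef>
+///
+/// // out[i][j] = mat[i][j] - row[j] over an n x m C-contiguous matrix
+/// template <typename T>
+/// void subtract_matrix_row(const T *mat, const T *row, T *out,
+///                          std::size_t n, std::size_t m)
+/// {
+///     for (std::size_t i = 0; i < n; ++i)
+///         for (std::size_t j = 0; j < m; ++j)
+///             out[i * m + j] = mat[i * m + j] - row[j];
+/// }
+/// \endcode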
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "subtract.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/subtract.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B23: ===== SUBTRACT (x1, x2) +namespace impl +{ +namespace subtract_fn_ns = dpctl::tensor::kernels::subtract; + +static binary_contig_impl_fn_ptr_t + subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int subtract_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// sub(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + subtract_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// sub(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + subtract_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t + subtract_inplace_row_matrix_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_subtract_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = subtract_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::SubtractTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(subtract_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::SubtractStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(subtract_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::SubtractContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(subtract_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::SubtractContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + SubtractContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + subtract_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using 
fn_ns::SubtractContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + SubtractContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + subtract_contig_row_contig_matrix_broadcast_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::SubtractInplaceStridedFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::SubtractInplaceContigFactory; + DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table); + + // function pointers for inplace operation on contiguous matrix + // and contiguous row + using fn_ns::SubtractInplaceRowMatrixBroadcastFactory; + DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table); +}; + +} // namespace impl + +void init_subtract(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_subtract_dispatch_tables(); + using impl::subtract_contig_dispatch_table; + using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table; + using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::subtract_output_id_table; + using impl::subtract_strided_dispatch_table; + + auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, subtract_output_id_table, + // function pointers to handle operation on contiguous + // arrays (pointers may be nullptr) + subtract_contig_dispatch_table, + // function pointers to handle operation on strided arrays + // (most general case) + subtract_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + subtract_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + subtract_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto subtract_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + subtract_output_id_table); + }; + m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_subtract_result_type", subtract_result_type_pyapi, ""); + + using impl::subtract_inplace_contig_dispatch_table; + using impl::subtract_inplace_row_matrix_dispatch_table; + using impl::subtract_inplace_strided_dispatch_table; + + auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, subtract_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + subtract_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + subtract_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + 
subtract_inplace_row_matrix_dispatch_table); + }; + m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp new file mode 100644 index 0000000000..0a4d707865 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_subtract(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp new file mode 100644 index 0000000000..2f1fbf55f2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
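The `init_subtract` binding above is representative of every binary elementwise operation in this PR: kernels are reached through num_types x num_types tables of function pointers indexed by the type ids of the two inputs, with a parallel integer table giving the result type id. A minimal self-contained sketch of that layout (all names and type ids here are invented for illustration; this is not dpctl's actual API):

#include <cstddef>
#include <iostream>

// Three pretend type ids: 0 = int, 1 = float, 2 = double.
constexpr int num_types = 3;

using binary_fn_ptr_t = void (*)(const void *, const void *, void *, std::size_t);

template <typename T1, typename T2, typename Tout>
void subtract_kernel(const void *a, const void *b, void *out, std::size_t n)
{
    const T1 *x = static_cast<const T1 *>(a);
    const T2 *y = static_cast<const T2 *>(b);
    Tout *r = static_cast<Tout *>(out);
    for (std::size_t i = 0; i < n; ++i)
        r[i] = static_cast<Tout>(x[i]) - static_cast<Tout>(y[i]);
}

// Populated once at module load, then indexed as table[src1_typeid][src2_typeid].
static binary_fn_ptr_t dispatch[num_types][num_types] = {};

int main()
{
    // Fill a few entries; unsupported pairs stay nullptr, which is why
    // the caller must check before invoking.
    dispatch[0][0] = subtract_kernel<int, int, int>;
    dispatch[0][1] = subtract_kernel<int, float, float>;

    int a[2] = {5, 7};
    float b[2] = {1.5f, 2.5f};
    float out[2];
    dispatch[0][1](a, b, out, 2); // look up by (src1 typeid, src2 typeid)
    std::cout << out[0] << ", " << out[1] << "\n"; // 3.5, 4.5
}

The contiguous, strided and matrix/row-broadcast variants registered above are additional tables of the same shape, each holding pointers to kernels specialized for that memory layout.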
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "tan.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/tan.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U34: ==== TAN (x) +namespace impl +{ + +namespace tan_fn_ns = dpctl::tensor::kernels::tan; + +static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types]; +static int tan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + tan_strided_dispatch_vector[td_ns::num_types]; + +void populate_tan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = tan_fn_ns; + + using fn_ns::TanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(tan_contig_dispatch_vector); + + using fn_ns::TanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(tan_strided_dispatch_vector); + + using fn_ns::TanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(tan_output_typeid_vector); +}; + +} // namespace impl + +void init_tan(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_tan_dispatch_vectors(); + using impl::tan_contig_dispatch_vector; + using impl::tan_output_typeid_vector; + using impl::tan_strided_dispatch_vector; + + auto tan_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, tan_output_typeid_vector, + tan_contig_dispatch_vector, tan_strided_dispatch_vector); + }; + m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto tan_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector); + }; + m.def("_tan_result_type", tan_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp new file mode 100644 index 0000000000..f89c8b8f6d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
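Unary operations such as `tan` take a single input type, so the two-dimensional tables collapse to one-dimensional dispatch vectors plus a type-id map, as populated by `populate_tan_dispatch_vectors` above. A hedged sketch of the same idea (invented names and pretend type ids, not dpctl's kernels):

#include <cmath>
#include <cstddef>
#include <iostream>

constexpr int num_types = 2; // pretend: 0 = float, 1 = double

using unary_fn_ptr_t = void (*)(const void *, void *, std::size_t);

template <typename T> void tan_kernel(const void *in, void *out, std::size_t n)
{
    const T *x = static_cast<const T *>(in);
    T *r = static_cast<T *>(out);
    for (std::size_t i = 0; i < n; ++i)
        r[i] = std::tan(x[i]);
}

// One slot per input type id ...
static unary_fn_ptr_t contig_dispatch[num_types] = {tan_kernel<float>,
                                                    tan_kernel<double>};
// ... and a parallel map from input type id to output type id.
static int output_typeid[num_types] = {0, 1}; // tan preserves the type

int main()
{
    double x[3] = {0.0, 0.25, 0.5};
    double y[3];
    contig_dispatch[1](x, y, 3);
    std::cout << y[1] << " (output typeid " << output_typeid[1] << ")\n";
}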
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_tan(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
new file mode 100644
index 0000000000..033389e46d
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "tanh.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/tanh.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U35: ==== TANH (x) +namespace impl +{ + +namespace tanh_fn_ns = dpctl::tensor::kernels::tanh; + +static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types]; +static int tanh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + tanh_strided_dispatch_vector[td_ns::num_types]; + +void populate_tanh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = tanh_fn_ns; + + using fn_ns::TanhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector); + + using fn_ns::TanhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector); + + using fn_ns::TanhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(tanh_output_typeid_vector); +}; + +} // namespace impl + +void init_tanh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_tanh_dispatch_vectors(); + using impl::tanh_contig_dispatch_vector; + using impl::tanh_output_typeid_vector; + using impl::tanh_strided_dispatch_vector; + + auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, tanh_output_typeid_vector, + tanh_contig_dispatch_vector, tanh_strided_dispatch_vector); + }; + m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto tanh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector); + }; + m.def("_tanh_result_type", tanh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp new file mode 100644 index 0000000000..e456182971 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_tanh(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
new file mode 100644
index 0000000000..22ad9bf3cb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -0,0 +1,241 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "true_divide.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/true_divide.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B08: ===== DIVIDE (x1, x2) +namespace impl +{ +namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide; + +static binary_contig_impl_fn_ptr_t + true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types]; +static int true_divide_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// divide(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + true_divide_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// divide(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + true_divide_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + true_divide_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + true_divide_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t + true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_true_divide_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = true_divide_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::TrueDivideTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(true_divide_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::TrueDivideStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(true_divide_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::TrueDivideContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(true_divide_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + TrueDivideContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + true_divide_contig_matrix_contig_row_broadcast_dispatch_table); 
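// A sketch of what the DispatchTableBuilder objects used throughout this
// function do (the real template lives in the included
// utils/type_dispatch.hpp; the factory body below is invented for
// illustration only):
//
//     template <typename fnT, typename T1, typename T2>
//     struct TrueDivideContigFactory {
//         fnT get() {
//             if constexpr (/* (T1, T2) is a supported pair */) {
//                 return true_divide_contig_impl<T1, T2>;
//             } else {
//                 return nullptr;
//             }
//         }
//     };
//
// populate_dispatch_table() instantiates the factory for every pair of the
// td_ns::num_types known types and writes each result into the
// num_types x num_types table, so each per-call lookup is a constant-time
// double index.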
+ + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + TrueDivideContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + true_divide_contig_row_contig_matrix_broadcast_dispatch_table); + + // which input types are supported, and what is the type of the result + using fn_ns::TrueDivideInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(true_divide_inplace_output_id_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::TrueDivideInplaceStridedFactory; + DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::TrueDivideInplaceContigFactory; + DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table); + + // function pointers for inplace operation on contiguous matrix + // and contiguous row + using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory; + DispatchTableBuilder + dtb9; + dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table); +}; + +} // namespace impl + +void init_divide(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_true_divide_dispatch_tables(); + using impl::true_divide_contig_dispatch_table; + using impl:: + true_divide_contig_matrix_contig_row_broadcast_dispatch_table; + using impl:: + true_divide_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::true_divide_output_id_table; + using impl::true_divide_strided_dispatch_table; + + auto divide_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, true_divide_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + true_divide_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + true_divide_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + true_divide_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + true_divide_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto divide_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + true_divide_output_id_table); + }; + m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_divide_result_type", divide_result_type_pyapi, ""); + + using impl::true_divide_inplace_contig_dispatch_table; + using impl::true_divide_inplace_output_id_table; + using impl::true_divide_inplace_row_matrix_dispatch_table; + using impl::true_divide_inplace_strided_dispatch_table; + + auto divide_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, 
true_divide_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + true_divide_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + true_divide_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + true_divide_inplace_row_matrix_dispatch_table); + }; + m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp new file mode 100644 index 0000000000..e29b858dae --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_divide(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp new file mode 100644 index 0000000000..5b2f451fb0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
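Unlike subtract, the divide bindings above carry a second id table, `true_divide_inplace_output_id_table`. True division of integers produces a floating-point result, so `x /= y` can only be routed to a kernel when the computed result type matches the type of the left-hand side; the separate inplace table encodes that restriction. A small sketch of how such id tables answer result-type queries (all values invented for a pretend three-type system, not dpctl's real table):

#include <iostream>
#include <stdexcept>

constexpr int num_types = 3; // pretend: 0 = int, 1 = float, 2 = double

// Out-of-place divide: integer inputs promote to floating point.
static int divide_result_id[num_types][num_types] = {
    {1, 1, 2}, // int    / {int, float, double}
    {1, 1, 2}, // float  / ...
    {2, 2, 2}, // double / ...
};

// In-place divide: only pairs whose result type equals the lhs type;
// -1 marks a rejected combination.
static int divide_inplace_id[num_types][num_types] = {
    {-1, -1, -1}, // int   /= anything would change the dtype
    {1, 1, -1},   // float /= double would too
    {2, 2, 2},
};

int inplace_result_type_or_throw(int lhs_id, int rhs_id)
{
    int out = divide_inplace_id[lhs_id][rhs_id];
    if (out < 0)
        throw std::invalid_argument("unsupported dtype combination");
    return out;
}

int main()
{
    std::cout << inplace_result_type_or_throw(2, 0) << "\n"; // 2 (double)
    std::cout << divide_result_id[0][0] << "\n";             // 1 (float)
}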
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "trunc.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/trunc.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U36: ==== TRUNC (x) +namespace impl +{ + +namespace trunc_fn_ns = dpctl::tensor::kernels::trunc; + +static unary_contig_impl_fn_ptr_t + trunc_contig_dispatch_vector[td_ns::num_types]; +static int trunc_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + trunc_strided_dispatch_vector[td_ns::num_types]; + +void populate_trunc_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = trunc_fn_ns; + + using fn_ns::TruncContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector); + + using fn_ns::TruncStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector); + + using fn_ns::TruncTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(trunc_output_typeid_vector); +}; + +} // namespace impl + +void init_trunc(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_trunc_dispatch_vectors(); + using impl::trunc_contig_dispatch_vector; + using impl::trunc_output_typeid_vector; + using impl::trunc_strided_dispatch_vector; + + auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, trunc_output_typeid_vector, + trunc_contig_dispatch_vector, trunc_strided_dispatch_vector); + }; + m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto trunc_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + trunc_output_typeid_vector); + }; + m.def("_trunc_result_type", trunc_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp new file mode 100644 index 0000000000..cc28397f55 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
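Each `init_*` function above registers its lambdas with `m.def`, naming the arguments and defaulting the event-dependency list to an empty `py::list()`. A minimal standalone pybind11 module showing the same registration idiom (the module and function names here are invented, not part of dpctl):

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

namespace py = pybind11;

int count_events(int base, const std::vector<int> &depends)
{
    return base + static_cast<int>(depends.size());
}

PYBIND11_MODULE(example, m)
{
    // Named arguments plus an empty-list default for the dependency list,
    // matching the `py::arg("depends") = py::list()` idiom above.
    m.def("count_events", &count_events, py::arg("base"),
          py::arg("depends") = py::list());
}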
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_trunc(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmax.cpp b/dpctl/tensor/libtensor/source/reductions/argmax.cpp new file mode 100644 index 0000000000..1d83bf9c2d --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmax.cpp @@ -0,0 +1,119 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
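The search reductions that follow (`argmax`, then `argmin`) reduce over the trailing `trailing_dims_to_reduce` dimensions of the input: a row-major array of shape (batch..., red...) is treated as independent reductions of the flattened trailing block. A plain serial sketch of that contract (invented helper, not the SYCL kernel):

#include <cstddef>
#include <iostream>
#include <vector>

// Row-major data of logical shape (batch, red): one argmax per batch row.
std::vector<std::size_t> argmax_trailing(const std::vector<float> &data,
                                         std::size_t batch, std::size_t red)
{
    std::vector<std::size_t> out(batch);
    for (std::size_t b = 0; b < batch; ++b) {
        const float *row = data.data() + b * red;
        std::size_t best = 0;
        for (std::size_t i = 1; i < red; ++i)
            if (row[i] > row[best])
                best = i;
        out[b] = best;
    }
    return out;
}

int main()
{
    // shape (2, 3), reduce over the trailing axis
    std::vector<float> a = {1.f, 5.f, 2.f, 7.f, 0.f, 3.f};
    for (auto i : argmax_trailing(a, 2, 3))
        std::cout << i << " "; // 1 0
}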
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgmaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgmaxOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgmaxOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmax(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis0_contig_temps_dispatch_table; + using impl::argmax_over_axis1_contig_temps_dispatch_table; + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table, + argmax_over_axis0_contig_temps_dispatch_table, + argmax_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.hpp b/dpctl/tensor/libtensor/source/reductions/argmax.hpp similarity index 90% rename from dpctl/tensor/libtensor/source/sum_reductions.hpp rename to dpctl/tensor/libtensor/source/reductions/argmax.hpp index ac612ec1f7..9958396b43 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.hpp +++ b/dpctl/tensor/libtensor/source/reductions/argmax.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // 
Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -23,9 +23,10 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include +namespace py = pybind11; + namespace dpctl { namespace tensor @@ -33,7 +34,7 @@ namespace tensor namespace py_internal { -extern void init_reduction_functions(py::module_ m); +extern void init_argmax(py::module_ m); } // namespace py_internal } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.cpp b/dpctl/tensor/libtensor/source/reductions/argmin.cpp new file mode 100644 index 0000000000..c6469e6864 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmin.cpp @@ -0,0 +1,119 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgminOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgminOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgminOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using 
impl::populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis0_contig_temps_dispatch_table; + using impl::argmin_over_axis1_contig_temps_dispatch_table; + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table, + argmin_over_axis0_contig_temps_dispatch_table, + argmin_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.hpp b/dpctl/tensor/libtensor/source/reductions/argmin.hpp new file mode 100644 index 0000000000..ea6ef1931c --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmin.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argmin(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp new file mode 100644 index 0000000000..e3b015a4e0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp @@ -0,0 +1,136 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
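Each reduction above keeps three dispatch tables: a general strided implementation plus fast paths for inputs contiguous along the reduction axis (axis-1) or along the batch axis (axis-0). A toy sketch of the selection such a caller might perform (simplified; the real `py_search_over_axis` also consults type ids, checks for nullptr entries, and threads event dependencies):

#include <iostream>

using impl_fn = const char *(*)();

const char *strided_impl() { return "strided (most general)"; }
const char *axis0_impl() { return "axis-0 contiguous fast path"; }
const char *axis1_impl() { return "axis-1 contiguous fast path"; }

// The real code keeps one num_types x num_types table per layout; a single
// entry per layout stands in for each table here.
impl_fn choose(bool reduction_axis_contig, bool batch_axis_contig)
{
    if (reduction_axis_contig)
        return axis1_impl; // the reduction runs over unit-stride data
    if (batch_axis_contig)
        return axis0_impl;
    return strided_impl;
}

int main() { std::cout << choose(true, false)() << "\n"; }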
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_logsumexp_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::LogSumExpOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table( + logsumexp_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::LogSumExpOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table( + logsumexp_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::LogSumExpOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + logsumexp_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_logsumexp_over_axis_dispatch_tables; + populate_logsumexp_over_axis_dispatch_tables(); + using impl::logsumexp_over_axis0_contig_temps_dispatch_table; + using impl::logsumexp_over_axis1_contig_temps_dispatch_table; + using impl::logsumexp_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto logsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + logsumexp_over_axis_strided_temps_dispatch_table, + logsumexp_over_axis0_contig_temps_dispatch_table, + logsumexp_over_axis1_contig_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + logsumexp_over_axis_strided_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis_dtype_supported", logsumexp_dtype_supported, + "", py::arg("arg_dtype"), 
py::arg("out_dtype")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp new file mode 100644 index 0000000000..46b2156f46 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logsumexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/max.cpp b/dpctl/tensor/libtensor/source/reductions/max.cpp new file mode 100644 index 0000000000..32c60b943b --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/max.cpp @@ -0,0 +1,171 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_max_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MaxOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types]; + +void populate_max_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MaxAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(max_atomic_support_vector); +} + +} // namespace impl + +void init_max(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis0_contig_temps_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_temps_dispatch_table; + using 
impl::max_over_axis_strided_atomic_dispatch_table; + using impl::max_over_axis_strided_temps_dispatch_table; + + using impl::populate_max_atomic_support_dispatch_vector; + populate_max_atomic_support_dispatch_vector(); + using impl::max_atomic_support_vector; + + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_temps_dispatch_table, + max_over_axis1_contig_temps_dispatch_table, + max_atomic_support_vector); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/max.hpp b/dpctl/tensor/libtensor/source/reductions/max.hpp new file mode 100644 index 0000000000..05a31fc1fb --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/max.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_max(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/min.cpp b/dpctl/tensor/libtensor/source/reductions/min.cpp new file mode 100644 index 0000000000..de1a81387d --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/min.cpp @@ -0,0 +1,173 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
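`init_max` above hands `py_reduction_over_axis` two full sets of tables, atomic and temps, plus `max_atomic_support_vector`. The intent is a runtime choice: take the single-pass atomic kernels when the device and output type support the required atomics, and fall back to tree reduction through temporary buffers otherwise. A toy sketch of that selection (the predicate values are invented):

#include <iostream>

using reduce_fn = const char *(*)();

const char *atomic_path() { return "single-pass kernel using device atomics"; }
const char *temps_path() { return "tree reduction via temporary buffers"; }

// Stand-in for max_atomic_support_vector[output_typeid](...).
bool atomics_supported_for(int output_typeid)
{
    // e.g. 64-bit floating-point atomics may be unavailable on a device
    return output_typeid != 2;
}

reduce_fn choose(int output_typeid, reduce_fn atomic_impl, reduce_fn temps_impl)
{
    return (atomic_impl && atomics_supported_for(output_typeid)) ? atomic_impl
                                                                 : temps_impl;
}

int main() { std::cout << choose(2, atomic_path, temps_path)() << "\n"; }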
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MinOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types]; + +void populate_min_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MinAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(min_atomic_support_vector); +} + +} // namespace impl + +void init_min(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using 
impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis0_contig_temps_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using impl::min_over_axis1_contig_temps_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + using impl::populate_min_atomic_support_dispatch_vector; + populate_min_atomic_support_dispatch_vector(); + using impl::min_atomic_support_vector; + + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_temps_dispatch_table, + min_over_axis1_contig_temps_dispatch_table, + min_atomic_support_vector); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/min.hpp b/dpctl/tensor/libtensor/source/reductions/min.hpp new file mode 100644 index 0000000000..cad94c7533 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/min.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_min(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/prod.cpp b/dpctl/tensor/libtensor/source/reductions/prod.cpp new file mode 100644 index 0000000000..a90d04304a --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/prod.cpp @@ -0,0 +1,187 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
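[Note: the DispatchTableBuilder/factory machinery driving populate_min_over_axis_dispatch_tables (and its max/prod/sum/hypot siblings) boils down to: instantiate a factory per (source, destination) type pair and record the function pointer its get() returns, with nullptr marking an unsupported pair. The real builders take template arguments such as the function-pointer type, the factory, and td_ns::num_types, which this rendering dropped between angle brackets. A self-contained miniature with a toy three-type universe and a demo factory in place of the dpctl kernels:]

    #include <cstddef>
    #include <cstdio>

    constexpr int num_types = 3; // miniature type universe

    using reduce_fn = long (*)(const void *src, std::size_t n);

    // One factory instantiation per (src, dst) pair; get() returns nullptr
    // for unsupported pairs, which is how the per-reduction factories signal
    // "fall back to another table".
    template <int src_id, int dst_id> struct DemoFactory
    {
        static long impl(const void *, std::size_t n)
        {
            return 100L * src_id + 10L * dst_id + static_cast<long>(n);
        }
        reduce_fn get() { return (dst_id >= src_id) ? impl : nullptr; }
    };

    template <template <int, int> class Factory> struct DispatchTableBuilder
    {
        template <int s> void fill_row(reduce_fn table[][num_types])
        {
            table[s][0] = Factory<s, 0>{}.get();
            table[s][1] = Factory<s, 1>{}.get();
            table[s][2] = Factory<s, 2>{}.get();
        }
        void populate_dispatch_table(reduce_fn table[][num_types])
        {
            fill_row<0>(table);
            fill_row<1>(table);
            fill_row<2>(table);
        }
    };

    static reduce_fn demo_table[num_types][num_types];

    int main()
    {
        DispatchTableBuilder<DemoFactory> dtb;
        dtb.populate_dispatch_table(demo_table);
        if (reduce_fn fn = demo_table[0][2]) {    // lookup by (src, dst) ids
            std::printf("%ld\n", fn(nullptr, 7)); // prints 27
        }
        return 0;
    }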
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::ProductOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types]; + +void populate_prod_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::ProductAtomicSupportFactory; + DispatchVectorBuilder + dvb; + 
dvb.populate_dispatch_vector(prod_atomic_support_vector); +} + +} // namespace impl + +void init_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis0_contig_temps_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_temps_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + using impl::populate_prod_atomic_support_dispatch_vector; + populate_prod_atomic_support_dispatch_vector(); + using impl::prod_atomic_support_vector; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + prod_over_axis1_contig_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_temps_dispatch_table, + prod_over_axis1_contig_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/prod.hpp b/dpctl/tensor/libtensor/source/reductions/prod.hpp new file mode 100644 index 0000000000..026e7d8923 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/prod.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
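[Note: the _prod_over_axis_dtype_supported binding above answers "is this (arg, out) dtype pair implemented" purely by dispatch-table lookup. A minimal sketch of that decision order, assuming flat std::array tables and precomputed type ids in place of the dpctl lookup machinery:]

    #include <array>

    constexpr int num_types = 3;
    using kernel_fn = void (*)();

    // Mirrors py_reduction_dtype_supported: consult the atomic table when the
    // destination supports atomics, otherwise fall back to the slower
    // temps-based table; "supported" simply means a non-null entry.
    bool reduction_dtype_supported(
        int arg_typeid,
        int out_typeid,
        bool supports_atomics,
        const std::array<std::array<kernel_fn, num_types>, num_types> &atomic_tbl,
        const std::array<std::array<kernel_fn, num_types>, num_types> &temps_tbl)
    {
        kernel_fn fn = nullptr;
        if (supports_atomics) {
            fn = atomic_tbl[arg_typeid][out_typeid];
        }
        if (fn == nullptr) {
            fn = temps_tbl[arg_typeid][out_typeid];
        }
        return fn != nullptr;
    }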
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_prod(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp new file mode 100644 index 0000000000..c7313930b4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -0,0 +1,132 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_hypot_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::HypotOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::HypotOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::HypotOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_reduce_hypot(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using 
impl::populate_hypot_over_axis_dispatch_tables; + populate_hypot_over_axis_dispatch_tables(); + using impl::hypot_over_axis0_contig_temps_dispatch_table; + using impl::hypot_over_axis1_contig_temps_dispatch_table; + using impl::hypot_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + hypot_over_axis_strided_temps_dispatch_table, + hypot_over_axis0_contig_temps_dispatch_table, + hypot_over_axis1_contig_temps_dispatch_table); + }; + m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto hypot_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + hypot_over_axis_strided_temps_dispatch_table); + }; + m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp new file mode 100644 index 0000000000..92b7fac363 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
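[Note: hypot deliberately registers no atomic dispatch tables; only the tree-reduction ("temps") kernels exist for it. A host-side sketch of the pairwise-combination idea those kernels rely on; this is illustrative only, since the device kernels combine work-item partials rather than recursing over vector halves:]

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Combine partial results pairwise; round-off grows ~log2(n) instead of
    // ~n for a running sum of squares. Precondition: hi > lo.
    double hypot_tree_reduce(const std::vector<double> &v, std::size_t lo,
                             std::size_t hi)
    {
        if (hi - lo == 1) {
            return std::fabs(v[lo]);
        }
        const std::size_t mid = lo + (hi - lo) / 2;
        return std::hypot(hypot_tree_reduce(v, lo, mid),
                          hypot_tree_reduce(v, mid, hi));
    }

    int main()
    {
        const std::vector<double> v{3.0, 4.0, 12.0};
        std::printf("%g\n", hypot_tree_reduce(v, 0, v.size())); // prints 13
    }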
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_reduce_hypot(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
new file mode 100644
index 0000000000..2478545efe
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -0,0 +1,143 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include
+#include
+#include
+
+#include "utils/type_utils.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+namespace atomic_support
+{
+
+typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc);
+
+/*! @brief Function which returns a constant value for atomic support */
+template <bool return_value>
+bool fixed_decision(const sycl::queue &, sycl::usm::alloc)
+{
+    return return_value;
+}
+
+/*! @brief Template for querying atomic support for a type on a device */
+template <typename T>
+bool check_atomic_support(const sycl::queue &exec_q,
+                          sycl::usm::alloc usm_alloc_type)
+{
+    constexpr bool atomic32 = (sizeof(T) == 4);
+    constexpr bool atomic64 = (sizeof(T) == 8);
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr ((!atomic32 && !atomic64) || is_complex<T>::value) {
+        return fixed_decision<false>(exec_q, usm_alloc_type);
+    }
+    else {
+        bool supports_atomics = false;
+        const sycl::device &dev = exec_q.get_device();
+        if constexpr (atomic64) {
+            if (!dev.has(sycl::aspect::atomic64)) {
+                return false;
+            }
+        }
+        switch (usm_alloc_type) {
+        case sycl::usm::alloc::shared:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_shared_allocations);
+            break;
+        case sycl::usm::alloc::host:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_host_allocations);
+            break;
+        case sycl::usm::alloc::device:
+            supports_atomics = true;
+            break;
+        default:
+            supports_atomics = false;
+        }
+        return supports_atomics;
+    }
+}
+
+template <typename fnT, typename T> struct ArithmeticAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (std::is_floating_point_v<T> ||
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
+        {
+            // For real- and complex-valued floating-point types, tree
+            // reduction has better round-off accumulation properties: its
+            // round-off error grows proportionally to log2(reduction_size),
+            // while the naive elementwise accumulation used by the atomic
+            // implementation has round-off error growing proportionally to
+            // reduction_size. Hence reductions over floating-point types
+            // always use the tree-reduction algorithm, even when an atomic
+            // implementation would be applicable.
+            return fixed_decision<false>;
+        }
+        else {
+            return check_atomic_support<T>;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct MinMaxAtomicSupportFactory
+{
+    fnT get()
+    {
+        return check_atomic_support<T>;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct ProductAtomicSupportFactory
+    : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+} // namespace atomic_support
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
new file mode 100644
index 0000000000..99edf663ad
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
@@ -0,0 +1,60 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
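[Note: downstream, py_reduction_over_axis consumes one of these per-type predicate vectors by indexing it with the destination type id and calling the predicate with the execution queue and the USM kind of the destination allocation. A stand-in sketch of that consumption; queue_stub and the two predicates are illustrative, not dpctl types:]

    #include <cstdio>

    constexpr int num_types = 3;

    struct queue_stub // stand-in for sycl::queue plus allocation kind
    {
        bool device_usm;
    };

    using support_fn = bool (*)(const queue_stub &);

    bool never(const queue_stub &) { return false; } // fixed_decision<false>
    bool if_device_usm(const queue_stub &q) { return q.device_usm; }

    // one predicate per output type id, as in sum/prod/max/min vectors
    static support_fn atomic_support_vector[num_types] = {never, if_device_usm,
                                                          never};

    int main()
    {
        queue_stub q{true};
        int dst_typeid = 1; // normally derived from the destination dtype
        bool supports_atomics = atomic_support_vector[dst_typeid](q);
        std::printf("%s\n", supports_atomics ? "atomic table" : "temps table");
    }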
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "argmax.hpp" +#include "argmin.hpp" +#include "logsumexp.hpp" +#include "max.hpp" +#include "min.hpp" +#include "prod.hpp" +#include "reduce_hypot.hpp" +#include "sum.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +/*! @brief Add reduction functions to Python module */ +void init_reduction_functions(py::module_ m) +{ + init_argmax(m); + init_argmin(m); + init_logsumexp(m); + init_max(m); + init_min(m); + init_prod(m); + init_reduce_hypot(m); + init_sum(m); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp new file mode 100644 index 0000000000..61c992364a --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_reduction_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp new file mode 100644 index 0000000000..5aafe38a40 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -0,0 +1,1099 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
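[Note: the fan-out in init_reduction_functions keeps each reduction in its own translation unit behind a tiny init_* hook, so adding a reduction never touches more than one .cpp plus this list. The same structure as a self-contained module, with demo names and trivial bindings in place of the dpctl ones:]

    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // each per-reduction translation unit contributes one init_* entry point
    void init_demo_sum(py::module_ m)
    {
        m.def("_sum", [](int a, int b) { return a + b; });
    }
    void init_demo_prod(py::module_ m)
    {
        m.def("_prod", [](int a, int b) { return a * b; });
    }

    // the common translation unit only sequences the hooks
    void init_reduction_functions(py::module_ m)
    {
        init_demo_sum(m);
        init_demo_prod(m);
    }

    PYBIND11_MODULE(_demo_impl, m) { init_reduction_functions(m); }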
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for reductions. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +/* ====================== dtype supported ======================== */ + +/*! @brief Template implementing Python API for querying type support by + * reduction which may support atomics */ +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == "device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = check_atomic_support[out_typeid](q, kind); + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + +/*! 
@brief Template implementing Python API for querying type support by tree + * reduction */ +template +bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + auto fn = temps_dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +/* ==================== Generic reductions ====================== */ + +/*! @brief Template implementing Python API for reduction over axis which may + * support atomics */ +template +std::pair py_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const contig_fnT &axis0_atomic_dispatch_table, + const contig_fnT &axis1_atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table, + const SupportAtomicFnT &check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " 
+ "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + 
reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[src_typeid][dst_typeid]; + if 
(fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(reduction_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/* ================= No atomic reductions ====================== */ + +/*! 
@brief Template implementing Python API for reduction over axis without + * atomics */ +template +std::pair py_tree_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + 
reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + 
dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(reduction_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/*! 
@brief Template implementing Python API for searching over an axis */ +template +std::pair py_search_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &axis0_contig_dispatch_table, + const contig_fnT &axis1_contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && 
dst_nd == 1) { + auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT compact_reduction_shape; + shT compact_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + compact_iteration_space( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + compact_reduction_shape, compact_reduction_src_strides); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + size_t iter_nelems = dst_nelems; + + if (compact_reduction_src_strides[0] == 1) { + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(compact_reduction_src_strides[0]) == + iter_nelems) { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = 
axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + compact_reduction_shape, compact_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_nd, iter_shape_and_strides, + iteration_src_offset, iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, comp_ev); +} + +extern void init_reduction_functions(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/sum.cpp b/dpctl/tensor/libtensor/source/reductions/sum.cpp new file mode 100644 index 0000000000..33803cfd7b --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/sum.cpp @@ -0,0 +1,187 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
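The search template above follows the same trailing-axes contract as the reduction templates: the Python layer permutes the axes being reduced to the end of the array and passes their count as trailing_dims_to_reduce. A minimal sketch of the resulting user-visible behavior, assuming the public dpctl.tensor API (values and shapes illustrative):

    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(24, dtype="i4"), (2, 3, 4))
    # reducing over axes (0, 2): the implementation moves them to the
    # trailing positions, so the kernel sees trailing_dims_to_reduce=2
    # and a destination of shape (3,)
    y = dpt.sum(x, axis=(0, 2))
    assert y.shape == (3,)
    assert int(y[0]) == 60  # 0+1+2+3 + 12+13+14+15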
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types]; + +void populate_sum_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::SumAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(sum_atomic_support_vector); +} + +} // namespace impl + +void init_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis0_contig_temps_dispatch_table; + using 
impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_temps_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + using impl::populate_sum_atomic_support_dispatch_vector; + populate_sum_atomic_support_dispatch_vector(); + using impl::sum_atomic_support_vector; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_temps_dispatch_table, + sum_over_axis1_contig_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/sum.hpp b/dpctl/tensor/libtensor/source/reductions/sum.hpp new file mode 100644 index 0000000000..ded0d14809 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/sum.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
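sum keeps two strided implementations per type pair: an atomics-based kernel and a temporaries-based fallback, with sum_atomic_support_vector deciding at runtime which one a given queue and USM kind may use. A sketch of how that support query surfaces at the Python level, assuming the private import path below matches the PYBIND11_MODULE(_tensor_reductions_impl) definition later in this patch (illustrative, not public API):

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_reductions_impl as tri  # assumed import path

    q = dpctl.SyclQueue()
    x = dpt.ones((512, 512), dtype="i4", sycl_queue=q)
    # True when either the atomic or the temporaries path can accumulate
    # "i4" inputs into an "i8" device-USM destination on this queue
    ok = tri._sum_over_axis_dtype_supported(x.dtype, dpt.int64, "device", q)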
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index 0dbfb17a5d..f3a20cbbaa 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -136,7 +136,6 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); size_t orthog_nelems(1); // number of orthogonal iterations - for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -237,18 +236,44 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - py::ssize_t src_shape(0); - py::ssize_t src_stride(0); - if (src_nd > 0) { - src_shape = src_shape_vec[0]; - src_stride = src_strides_vec[0]; + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); - sycl::event repeat_ev = + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, - cumsum_data_p, src_shape, src_stride, dst_shape_vec[0], - dst_strides_vec[0], reps_shape_vec[0], reps_strides_vec[0], - depends); + cumsum_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], + reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // non-empty othogonal directions @@ -343,6 +368,162 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, return std::make_pair(py_obj_management_host_task_ev, repeat_ev); } +std::pair +py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) 
{ + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t src_sz = src.get_size(); + size_t reps_sz = reps.get_size(); + size_t cumsum_sz = cumsum.get_size(); + + // shape at repeated axis must be equal to the sum of reps + if (src_sz != reps_sz || src_sz != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(dst.get_size())) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, cumsum, or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shapes_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_src_shapes_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector 
all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn( + exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0], + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shapes_strides] { + sycl::free(packed_src_shapes_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + std::pair py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, @@ -372,7 +553,6 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); size_t orthog_nelems(1); // number of orthogonal iterations - for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -452,15 +632,42 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - py::ssize_t src_shape(0); - py::ssize_t src_stride(0); - if (src_nd > 0) { - src_shape = src_shape_vec[0]; - src_stride = src_strides_vec[0]; + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; } - sycl::event repeat_ev = - fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, src_shape, - src_stride, dst_shape_vec[0], dst_strides_vec[0], depends); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, dst_shape_vec[0], + dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // non-empty othogonal directions @@ -554,6 +761,126 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, return std::make_pair(py_obj_management_host_task_ev, repeat_ev); } +std::pair +py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends) +{ + int dst_nd = 
dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t src_sz = src.get_size(); + size_t dst_sz = dst.get_size(); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if ((src_sz * reps) != dst_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(src_sz * reps)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + + host_task_events.push_back(cleanup_tmp_allocations_ev); + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} 
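Together with the axis-taking overloads earlier in this file, the two overloads added here complete the axis=None case, where the input is treated as a flattened array and the destination is 1-D. A short usage sketch against the public dpt.repeat API (shapes illustrative):

    import dpctl.tensor as dpt

    x = dpt.asarray([[1, 2], [3, 4]])
    r1 = dpt.repeat(x, 2, axis=1)  # per-axis path -> shape (2, 4)
    r2 = dpt.repeat(x, 2)          # axis=None path added here -> shape (8,)
    assert r2.shape == (8,)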
+ } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.hpp b/dpctl/tensor/libtensor/source/repeat.hpp index 87fb0a0847..65ace36516 100644 --- a/dpctl/tensor/libtensor/source/repeat.hpp +++ b/dpctl/tensor/libtensor/source/repeat.hpp @@ -48,6 +48,14 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, sycl::queue &exec_q, const std::vector &depends); +extern std::pair +py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends); + extern std::pair py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, @@ -56,6 +64,13 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, sycl::queue &exec_q, const std::vector &depends); +extern std::pair +py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp deleted file mode 100644 index 529096f5b6..0000000000 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ /dev/null @@ -1,542 +0,0 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
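The sum_reductions.cpp file being deleted here bundled its own check_atomic_support() helper; in the new layout that logic lives in reduction_atomic_support.hpp as a per-type dispatch vector (see sum.cpp above). The device aspects it keys on are queryable from Python; a quick check, assuming the standard dpctl device API:

    import dpctl

    dev = dpctl.SyclDevice()
    # fp64 accumulation via atomics additionally requires this aspect
    print(dev.has_aspect_atomic64)
    # shared/host USM destinations require the matching USM-atomic aspects
    print(dev.has_aspect_usm_atomic_shared_allocations)
    print(dev.has_aspect_usm_atomic_host_allocations)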
-// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include - -#include "dpctl4pybind11.hpp" -#include -#include -#include - -#include "kernels/reductions.hpp" -#include "sum_reductions.hpp" - -#include "simplify_iteration_space.hpp" -#include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) -{ - bool supports_atomics = false; - - const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { - if (!dev.has(sycl::aspect::atomic64)) - return false; - } - - switch (usm_alloc_type) { - case sycl::usm::alloc::shared: - supports_atomics = dev.has(sycl::aspect::usm_atomic_shared_allocations); - break; - case sycl::usm::alloc::host: - supports_atomics = dev.has(sycl::aspect::usm_atomic_host_allocations); - break; - case sycl::usm::alloc::device: - supports_atomics = true; - break; - default: - supports_atomics = false; - } - - return supports_atomics; -} - -using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -std::pair py_sum_over_axis( - const dpctl::tensor::usm_ndarray &src, - int trailing_dims_to_reduce, // sum over this many trailing indexes - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - int src_nd = src.get_ndim(); - int iteration_nd = src_nd - trailing_dims_to_reduce; - if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { - throw py::value_error("Trailing_dim_to_reduce must be positive, but no " - "greater than rank of the array being reduced"); - } - - int dst_nd = dst.get_ndim(); - if (dst_nd != iteration_nd) { - throw py::value_error("Destination array rank does not match input " - "array rank and number of reduced dimensions"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); - - bool same_shapes = true; - for (int i = 0; same_shapes && (i < dst_nd); ++i) { - same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); - } - - if (!same_shapes) { - throw py::value_error("Destination shape does not match unreduced " - "dimensions of the input shape"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - size_t dst_nelems = dst.get_size(); - - size_t reduction_nelems(1); - for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); - } - - // check that dst and src do not overlap - auto const &overlap = 
dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, dst)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - // destination must be ample enough to accommodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - int dst_itemsize = dst.get_elemsize(); - bool supports_atomics = false; - - switch (dst_itemsize) { - case sizeof(float): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support(exec_q, usm_type); - } break; - case sizeof(double): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - - constexpr bool require_atomic64 = true; - supports_atomics = - check_atomic_support(exec_q, usm_type, require_atomic64); - } break; - } - - // handle special case when both reduction and iteration are 1D contiguous - // and can be done with atomics - if (supports_atomics) { - bool is_src_c_contig = src.is_c_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_src_f_contig = src.is_f_contiguous(); - - if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nelems == 1)) - { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - else if (is_src_f_contig && - ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) - { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - } - - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - - auto const &src_shape_vecs = src.get_shape_vector(); - auto const &src_strides_vecs = src.get_strides_vector(); - auto const &dst_strides_vecs = dst.get_strides_vector(); - - int reduction_nd = trailing_dims_to_reduce; - const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; - using shT = 
std::vector; - shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, - std::end(src_strides_vecs)); - - shT simplified_reduction_shape; - shT simplified_reduction_src_strides; - py::ssize_t reduction_src_offset(0); - - simplify_iteration_space_1( - reduction_nd, reduction_shape_ptr, reduction_src_strides, - // output - simplified_reduction_shape, simplified_reduction_src_strides, - reduction_src_offset); - - const py::ssize_t *iteration_shape_ptr = src_shape_ptr; - - shT iteration_src_strides(std::begin(src_strides_vecs), - std::begin(src_strides_vecs) + iteration_nd); - shT const &iteration_dst_strides = dst_strides_vecs; - - shT simplified_iteration_shape; - shT simplified_iteration_src_strides; - shT simplified_iteration_dst_strides; - py::ssize_t iteration_src_offset(0); - py::ssize_t iteration_dst_offset(0); - - if (iteration_nd == 0) { - if (dst_nelems != 1) { - throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); - } - iteration_nd = 1; - simplified_iteration_shape.push_back(1); - simplified_iteration_src_strides.push_back(0); - simplified_iteration_dst_strides.push_back(0); - } - else { - simplify_iteration_space(iteration_nd, iteration_shape_ptr, - iteration_src_strides, iteration_dst_strides, - // output - simplified_iteration_shape, - simplified_iteration_src_strides, - simplified_iteration_dst_strides, - iteration_src_offset, iteration_dst_offset); - } - - if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { - bool mat_reduce_over_axis1 = false; - bool mat_reduce_over_axis0 = false; - bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; - - if (simplified_reduction_src_strides[0] == 1) { - array_reduce_all_elems = (simplified_iteration_shape[0] == 1); - mat_reduce_over_axis1 = - (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); - } - else if (static_cast(simplified_reduction_src_strides[0]) == - iter_nelems) - { - mat_reduce_over_axis0 = - (simplified_iteration_dst_strides[0] == 1) && - (simplified_iteration_src_strides[0] == 1); - } - - if (mat_reduce_over_axis1 || array_reduce_all_elems) { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis1_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis1_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis1_contig_ev); - } - } - else if (mat_reduce_over_axis0) { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis0_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis0_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis0_contig_ev); - } - } - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[src_typeid][dst_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = 
sum_over_axis_strided_temps_dispatch_table[src_typeid][dst_typeid]; - if (fn == nullptr) { - throw std::runtime_error("Datatypes are not supported"); - } - } - - std::vector host_task_events{}; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - - const auto &arrays_metainfo_packing_triple_ = - device_allocate_and_pack( - exec_q, host_task_events, - // iteration metadata - simplified_iteration_shape, simplified_iteration_src_strides, - simplified_iteration_dst_strides, - // reduction metadata - simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } - const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); - - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = - temp_allocation_ptr + 3 * simplified_iteration_shape.size(); - - std::vector all_deps; - all_deps.reserve(depends.size() + 1); - all_deps.resize(depends.size()); - std::copy(depends.begin(), depends.end(), all_deps.begin()); - all_deps.push_back(copy_metadata_ev); - - auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_nd, iter_shape_and_strides, - iteration_src_offset, iteration_dst_offset, - reduction_nd, // number dimensions being reduced - reduction_shape_stride, reduction_src_offset, all_deps); - - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - cgh.host_task([ctx, temp_allocation_ptr] { - sycl::free(temp_allocation_ptr, ctx); - }); - }); - host_task_events.push_back(temp_cleanup_ev); - - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - - return std::make_pair(keep_args_event, comp_ev); -} - -bool py_sum_over_axis_dtype_supported(const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q) -{ - int arg_tn = - input_dtype.num(); // NumPy type numbers are the same as in dpctl - int out_tn = - output_dtype.num(); // NumPy type numbers are the same as in dpctl - int arg_typeid = -1; - int out_typeid = -1; - - auto array_types = td_ns::usm_ndarray_types(); - - try { - arg_typeid = array_types.typenum_to_lookup_id(arg_tn); - out_typeid = array_types.typenum_to_lookup_id(out_tn); - } catch (const std::exception &e) { - throw py::value_error(e.what()); - } - - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || - out_typeid >= td_ns::num_types) - { - throw std::runtime_error("Reduction type support check: lookup failed"); - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - sycl::usm::alloc kind = sycl::usm::alloc::unknown; - - if (dst_usm_type == "device") { - kind = sycl::usm::alloc::device; - } - else if (dst_usm_type == "shared") { - kind = sycl::usm::alloc::shared; - } - else if (dst_usm_type == "host") { - kind = sycl::usm::alloc::host; - } - else { - throw py::value_error("Unrecognized `dst_usm_type` argument."); - } - - bool supports_atomics = false; - - switch (output_dtype.itemsize()) { - case sizeof(float): - { - supports_atomics = check_atomic_support(q, kind); - } break; - case sizeof(double): - { - constexpr bool require_atomic64 = true; - supports_atomics = check_atomic_support(q, kind, 
require_atomic64); - } break; - } - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[arg_typeid][out_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = sum_over_axis_strided_temps_dispatch_table[arg_typeid][out_typeid]; - } - - return (fn != nullptr); -} - -void populate_sum_over_axis_dispatch_table(void) -{ - using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - using namespace td_ns; - - using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; - DispatchTableBuilder - dtb1; - dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; - DispatchTableBuilder - dtb4; - dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); -} - -namespace py = pybind11; - -void init_reduction_functions(py::module_ m) -{ - populate_sum_over_axis_dispatch_table(); - - m.def("_sum_over_axis", &py_sum_over_axis, "", py::arg("src"), - py::arg("trailing_dims_to_reduce"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_sum_over_axis_dtype_supported", &py_sum_over_axis_dtype_supported, - "", py::arg("arg_dtype"), py::arg("out_dtype"), - py::arg("dst_usm_type"), py::arg("sycl_queue")); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp similarity index 87% rename from dpctl/tensor/libtensor/source/tensor_py.cpp rename to dpctl/tensor/libtensor/source/tensor_ctors.cpp index 2ce7c72add..4720f6baa1 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -1,4 +1,5 @@ -//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===// +//===-- tensor_ctors.cpp - ---*-C++-*-/===// +// Implementation of _tensor_impl module // // Data Parallel Control (dpctl) // @@ -30,25 +31,24 @@ #include #include #include +#include #include "dpctl4pybind11.hpp" #include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" -#include "boolean_reductions.hpp" +#include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" -#include "elementwise_functions.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" -#include "sum_reductions.hpp" #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" @@ -115,6 +115,9 @@ using dpctl::tensor::py_internal::usm_ndarray_triul; using dpctl::tensor::py_internal::py_where; +/* =========================== Clip ============================== */ +using dpctl::tensor::py_internal::py_clip; + // populate dispatch tables void init_dispatch_tables(void) { @@ -147,6 +150,8 @@ void init_dispatch_vectors(void) populate_cumsum_1d_dispatch_vectors(); 
     init_repeat_dispatch_vectors();
 
+    init_clip_dispatch_vectors();
+
     return;
 }
 
@@ -402,15 +407,49 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
-    m.def("_repeat_by_sequence", &py_repeat_by_sequence, "", py::arg("src"),
+    auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
+                              const dpctl::tensor::usm_ndarray &dst,
+                              const dpctl::tensor::usm_ndarray &reps,
+                              const dpctl::tensor::usm_ndarray &cumsum,
+                              std::optional<py::ssize_t> axis,
+                              sycl::queue &exec_q,
+                              const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(),
+                                         exec_q, depends);
+        }
+        else {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
+                                         depends);
+        }
+    };
+    m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
           py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());
 
-    m.def("_repeat_by_scalar", &py_repeat_by_scalar, "", py::arg("src"),
-          py::arg("dst"), py::arg("reps"), py::arg("axis"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
+    auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            const py::ssize_t reps,
+                            std::optional<py::ssize_t> axis,
+                            sycl::queue &exec_q,
+                            const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
+                                       depends);
+        }
+        else {
+            return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
+        }
+    };
+    m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
+          py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 
-    dpctl::tensor::py_internal::init_elementwise_functions(m);
-    dpctl::tensor::py_internal::init_boolean_reduction_functions(m);
-    dpctl::tensor::py_internal::init_reduction_functions(m);
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max`] and writes the result to the "
+          "array `dst` for each element of `x`, `min`, and `max`. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 }
diff --git a/dpctl/tensor/libtensor/source/tensor_elementwise.cpp b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp
new file mode 100644
index 0000000000..1a86526893
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp
@@ -0,0 +1,34 @@
+//===-- tensor_elementwise.cpp ---*-C++-*-/===//
+// Implementation of _tensor_elementwise_impl module
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
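The _clip binding registered above backs the new dpt.clip function. A minimal usage sketch (scalar bounds shown; min and max broadcast against x):

    import dpctl.tensor as dpt

    x = dpt.arange(10, dtype="f4")
    y = dpt.clip(x, 2, 7)  # elementwise clamp of x to the range [2, 7]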
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "elementwise_functions/elementwise_common.hpp" +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_elementwise_impl, m) +{ + dpctl::tensor::py_internal::init_elementwise_functions(m); +} diff --git a/dpctl/tensor/libtensor/source/tensor_reductions.cpp b/dpctl/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 0000000000..138c31f3eb --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,37 @@ +//===-- tensor_reductions.cpp - --*-C++-*-/===// +// Implementation of _tensor_reductions_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "boolean_reductions.hpp" +#include "reductions/reduction_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_boolean_reduction_functions(m); + dpctl::tensor::py_internal::init_reduction_functions(m); +} diff --git a/dpctl/tests/elementwise/test_bitwise_and.py b/dpctl/tests/elementwise/test_bitwise_and.py index b3a5bd665b..824e319709 100644 --- a/dpctl/tests/elementwise/test_bitwise_and.py +++ b/dpctl/tests/elementwise/test_bitwise_and.py @@ -18,6 +18,7 @@ import pytest import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported from .utils import _integral_dtypes @@ -85,3 +86,58 @@ def test_bitwise_and_bool(): r_lo = dpt.logical_and(x1[:, dpt.newaxis], x2[dpt.newaxis]) assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X &= False + else: + X &= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 &= ar2 + assert 
dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 &= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(TypeError):
+            ar1 &= ar2
+            dpt.bitwise_and(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_and(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_and(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_and(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
index cee1019353..06684ac13b 100644
--- a/dpctl/tests/elementwise/test_bitwise_left_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_left_shift.py
@@ -18,6 +18,7 @@ import pytest
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 from .utils import _integral_dtypes
@@ -97,3 +98,54 @@ def test_bitwise_left_shift_range(op_dtype):
     z = dpt.bitwise_left_shift(x, y)
     assert dpt.all(dpt.equal(z, 0))
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    X <<= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 <<= ar2
+        assert dpt.all(ar1 == 2)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 <<= ar4
+        assert dpt.all(ar3 == 2)
+    else:
+        with pytest.raises(TypeError):
+            ar1 <<= ar2
+            dpt.bitwise_left_shift(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_left_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 2)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_left_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 2)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_left_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_or.py b/dpctl/tests/elementwise/test_bitwise_or.py
index d273bd1507..49949cb795 100644
--- a/dpctl/tests/elementwise/test_bitwise_or.py
+++ b/dpctl/tests/elementwise/test_bitwise_or.py
@@ -18,6 +18,7 @@ import pytest
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 from .utils import _integral_dtypes
@@ -85,3 +86,58 @@ def test_bitwise_or_bool():
     r_lo =
dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis]) assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X |= False + else: + X |= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 |= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 |= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(TypeError): + ar1 |= ar2 + dpt.bitwise_or(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64): + dpt.bitwise_or(ar1, ar2, out=ar2) + assert dpt.all(ar2 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + dpt.bitwise_or(ar3, ar4, out=ar4) + dpt.all(ar4 == 1) + else: + with pytest.raises(TypeError): + dpt.bitwise_or(ar1, ar2, out=ar2) diff --git a/dpctl/tests/elementwise/test_bitwise_right_shift.py b/dpctl/tests/elementwise/test_bitwise_right_shift.py index ceadb9414d..37112133db 100644 --- a/dpctl/tests/elementwise/test_bitwise_right_shift.py +++ b/dpctl/tests/elementwise/test_bitwise_right_shift.py @@ -18,6 +18,7 @@ import pytest import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported from .utils import _integral_dtypes @@ -97,3 +98,54 @@ def test_bitwise_right_shift_range(op_dtype): z = dpt.bitwise_right_shift(x, y) assert dpt.all(dpt.equal(z, 0)) + + +@pytest.mark.parametrize("dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + X >>= int(0) + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 >>= ar2 + assert dpt.all(ar1 == 0) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 >>= ar4 + assert dpt.all(ar3 == 0) + else: + with pytest.raises(TypeError): + ar1 >>= ar2 + dpt.bitwise_right_shift(ar1, ar2, out=ar1) + + # out is second arg + ar1 = 
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_right_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_right_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_right_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_xor.py b/dpctl/tests/elementwise/test_bitwise_xor.py
index b2cb11bc84..e9501b642f 100644
--- a/dpctl/tests/elementwise/test_bitwise_xor.py
+++ b/dpctl/tests/elementwise/test_bitwise_xor.py
@@ -18,6 +18,7 @@
 import pytest
 
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _integral_dtypes
@@ -85,3 +86,58 @@ def test_bitwise_xor_bool():
     r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
     assert dpt.all(dpt.equal(r_bw, r_lo))
+
+
+@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "b":
+        X ^= False
+    else:
+        X ^= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 ^= ar2
+        assert dpt.all(ar1 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 ^= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(TypeError):
+            ar1 ^= ar2
+            dpt.bitwise_xor(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_xor(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_xor(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_xor(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_cbrt.py b/dpctl/tests/elementwise/test_cbrt.py
new file mode 100644
index 0000000000..b06a8d19cf
--- /dev/null
+++ b/dpctl/tests/elementwise/test_cbrt.py
@@ -0,0 +1,79 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_cbrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.cbrt(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.usefixtures("suppress_invalid_numpy_warnings") +def test_cbrt_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.cbrt(X) + expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + tol = dpt.finfo(dpt.float32).resolution + + assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True) diff --git a/dpctl/tests/elementwise/test_copysign.py b/dpctl/tests/elementwise/test_copysign.py new file mode 100644 index 0000000000..26a285343c --- /dev/null +++ b/dpctl/tests/elementwise/test_copysign.py @@ -0,0 +1,111 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
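+
+# Note: copysign(x1, x2) composes the magnitude of x1 with the sign of x2.
+# The sign bit is transferred directly, so the operation is well defined for
+# signed zeros and NaNs; that is why the special-value checks below inspect
+# dpt.signbit rather than comparing values, e.g. copysign(1.0, -0.0) == -1.0
+# and copysign(nan, -1.0) is a NaN whose sign bit is set.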
+ +import ctypes + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _compare_dtypes, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_copysign_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.copysign(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.copysign(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _real_fp_dtypes) +def test_copysign_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.copysign(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.copysign(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", _real_fp_dtypes) +def test_copysign(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.arange(100, dtype=dt, sycl_queue=q) + x[1::2] *= -1 + y = dpt.ones(100, dtype=dt, sycl_queue=q) + y[::2] *= -1 + res = dpt.copysign(x, y) + expected = dpt.negative(x) + tol = dpt.finfo(dt).resolution + assert dpt.allclose(res, expected, atol=tol, rtol=tol) + + +def test_copysign_special_values(): + get_queue_or_skip() + + x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4") + y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4") + res = dpt.copysign(x1, y1) + assert dpt.all(dpt.signbit(res)) + x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4") + res = dpt.copysign(x2, y1) + assert dpt.all(dpt.signbit(res)) + y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4") + res = dpt.copysign(x2, y2) + assert not dpt.any(dpt.signbit(res)) + res = dpt.copysign(x1, y2) + assert not dpt.any(dpt.signbit(res)) diff --git a/dpctl/tests/elementwise/test_divide.py b/dpctl/tests/elementwise/test_divide.py index 41aac736d7..a54060792c 100644 --- a/dpctl/tests/elementwise/test_divide.py +++ b/dpctl/tests/elementwise/test_divide.py @@ -21,9 +21,16 @@ import dpctl import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported -from .utils import _all_dtypes, _compare_dtypes, _usm_types +from .utils import ( + _all_dtypes, + _compare_dtypes, + _complex_fp_dtypes, + _real_fp_dtypes, + _usm_types, +) @pytest.mark.parametrize("op1_dtype", _all_dtypes) @@ -187,3 +194,65 @@ def __sycl_usm_array_interface__(self): c = Canary() with 
pytest.raises(ValueError):
         dpt.divide(a, c)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes + _complex_fp_dtypes)
+def test_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "f":
+        X /= float(1)
+    elif dt_kind == "c":
+        X /= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # out array only valid if it is inexact
+    if (
+        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64)
+        and dpt.dtype(op1_dtype).kind in "fc"
+    ):
+        ar1 /= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 /= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(TypeError):
+            ar1 /= ar2
+            dpt.divide(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if (
+        _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64)
+        and dpt.dtype(op2_dtype).kind in "fc"
+    ):
+        dpt.divide(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.divide(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.divide(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_elementwise_classes.py b/dpctl/tests/elementwise/test_elementwise_classes.py
new file mode 100644
index 0000000000..b7f1d26d6e
--- /dev/null
+++ b/dpctl/tests/elementwise/test_elementwise_classes.py
@@ -0,0 +1,80 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
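+
+# Note: dpt.negative and dpt.divide are used here as representative instances
+# of the elementwise unary and binary function classes. The getters exercised
+# below return the low-level implementation and type-resolution callables,
+# and the ``types`` property lists the supported type signatures as strings
+# containing "->" (input dtypes mapped to the result dtype).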
+ +import dpctl.tensor as dpt + +unary_fn = dpt.negative +binary_fn = dpt.divide + + +def test_unary_class_getters(): + fn = unary_fn.get_implementation_function() + assert callable(fn) + + fn = unary_fn.get_type_result_resolver_function() + assert callable(fn) + + +def test_unary_class_types_property(): + loop_types = unary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_unary_class_str_repr(): + s = str(unary_fn) + r = repr(unary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = unary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_binary_class_getters(): + fn = binary_fn.get_implementation_function() + assert callable(fn) + + fn = binary_fn.get_implementation_inplace_function() + assert callable(fn) + + fn = binary_fn.get_type_result_resolver_function() + assert callable(fn) + + fn = binary_fn.get_type_promotion_path_acceptance_function() + assert callable(fn) + + +def test_binary_class_types_property(): + loop_types = binary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_binary_class_str_repr(): + s = str(binary_fn) + r = repr(binary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = binary_fn.__name__ + assert kl_n in s + assert kl_n in r diff --git a/dpctl/tests/elementwise/test_exp2.py b/dpctl/tests/elementwise/test_exp2.py new file mode 100644 index 0000000000..d4bef1efab --- /dev/null +++ b/dpctl/tests/elementwise/test_exp2.py @@ -0,0 +1,168 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
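+
+# Note: exp2(x) computes 2**x, i.e. exp(x * ln 2), so the real special values
+# below follow directly from exp: exp2(nan) -> nan, exp2(+-0) -> 1,
+# exp2(inf) -> inf and exp2(-inf) -> 0. The complex expectations mirror the
+# C99 special-value conventions for cexp applied to the scaled argument.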
+ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _map_to_device_dtype, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.exp2(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_exp2_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + Y = dpt.exp2(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4)) + expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.exp2(U, order=ord) + expected_Y = np.exp2(dpt.asnumpy(U)) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_exp2_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) + + # special cases for complex variant + num_finite = 1.0 + vals = [ + complex(0.0, 0.0), + complex(num_finite, dpt.inf), + complex(num_finite, dpt.nan), + complex(dpt.inf, 0.0), + complex(-dpt.inf, num_finite), + complex(dpt.inf, num_finite), + complex(-dpt.inf, dpt.inf), + complex(dpt.inf, dpt.inf), + complex(-dpt.inf, dpt.nan), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 0.0), + 
complex(dpt.nan, num_finite), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + cis_1 = complex(np.cos(num_finite), np.sin(num_finite)) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(1.0, 0.0), + c_nan, + c_nan, + complex(np.inf, 0.0), + 0.0, + np.inf * cis_1, + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(np.nan, 0.0), + c_nan, + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py index c8ba5e80f1..b57c006cdf 100644 --- a/dpctl/tests/elementwise/test_floor_divide.py +++ b/dpctl/tests/elementwise/test_floor_divide.py @@ -21,13 +21,19 @@ import dpctl import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported -from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types +from .utils import ( + _compare_dtypes, + _integral_dtypes, + _no_complex_dtypes, + _usm_types, +) -@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) -@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(op1_dtype, q) @@ -133,7 +139,7 @@ def test_floor_divide_broadcasting(): assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() -@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) def test_floor_divide_python_scalar(arr_dt): q = get_queue_or_skip() skip_if_dtype_not_supported(arr_dt, q) @@ -204,7 +210,7 @@ def test_floor_divide_gh_1247(): ) -@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:9]) +@pytest.mark.parametrize("dtype", _integral_dtypes) def test_floor_divide_integer_zero(dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) @@ -255,3 +261,59 @@ def test_floor_divide_special_cases(): res = dpt.floor_divide(x, y) res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y)) np.testing.assert_array_equal(dpt.asnumpy(res), res_np) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:]) +def test_divide_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X //= int(1) + elif dt_kind == "f": + X //= float(1) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # out array only valid if it is inexact + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 //= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 //= ar4 + assert dpt.all(ar3 == 
1)
+    else:
+        with pytest.raises(TypeError):
+            ar1 //= ar2
+            dpt.floor_divide(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.floor_divide(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.floor_divide(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.floor_divide(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_pow.py b/dpctl/tests/elementwise/test_pow.py
index 1f13e2b533..8b76e3a9fc 100644
--- a/dpctl/tests/elementwise/test_pow.py
+++ b/dpctl/tests/elementwise/test_pow.py
@@ -21,6 +21,7 @@
 import dpctl
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _all_dtypes, _compare_dtypes, _usm_types
@@ -152,3 +153,60 @@ def test_pow_python_scalar(arr_dt):
         assert isinstance(R, dpt.usm_ndarray)
         R = dpt.pow(sc, X)
         assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_pow_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X **= int(1)
+    elif dt_kind == "f":
+        X **= float(1)
+    elif dt_kind == "c":
+        X **= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 **= ar2
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] **= ar4[::2]
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype)
+        ).all()
+
+    else:
+        with pytest.raises(TypeError):
+            ar1 **= ar2
+
+
+def test_pow_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = dpt.square(x)
+    x **= 2
+
+    assert dpt.all(x == expected)
diff --git a/dpctl/tests/elementwise/test_remainder.py b/dpctl/tests/elementwise/test_remainder.py
index def594f269..47500954a2 100644
--- a/dpctl/tests/elementwise/test_remainder.py
+++ b/dpctl/tests/elementwise/test_remainder.py
@@ -21,6 +21,7 @@
 import dpctl
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types
@@ -206,3 +207,54 @@ def test_remainder_python_scalar(arr_dt):
         assert isinstance(R, dpt.usm_ndarray)
         R = dpt.remainder(sc, X)
         assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X %= int(1)
+    elif dt_kind ==
"f": + X %= float(1) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 %= ar2 + assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype)) + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + ar3[::-1] %= ar4[::2] + assert dpt.all(ar3 == dpt.zeros(ar3.shape, dtype=ar3.dtype)) + + else: + with pytest.raises(TypeError): + ar1 %= ar2 + + +def test_remainder_inplace_basic(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + expected = x & 1 + x %= 2 + + assert dpt.all(x == expected) diff --git a/dpctl/tests/elementwise/test_rsqrt.py b/dpctl/tests/elementwise/test_rsqrt.py new file mode 100644 index 0000000000..ef9378ade2 --- /dev/null +++ b/dpctl/tests/elementwise/test_rsqrt.py @@ -0,0 +1,74 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_rsqrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.rsqrt(x).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_rsqrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + res = dpt.rsqrt(x) + expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype)) + tol = 8 * dpt.finfo(res.dtype).resolution + assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_rsqrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + res = dpt.rsqrt(x) + expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype)) + tol = 8 * dpt.finfo(res.dtype).resolution + assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol) + + +def test_rsqrt_special_cases(): + get_queue_or_skip() + + x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.rsqrt(x) + expected = dpt.asarray( + [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4" + ) + assert dpt.allclose(res, expected, equal_nan=True) diff --git a/dpctl/tests/test_sycl_event.py b/dpctl/tests/test_sycl_event.py index fa496d1bb8..7f0db07539 100644 --- a/dpctl/tests/test_sycl_event.py +++ b/dpctl/tests/test_sycl_event.py @@ -202,7 +202,12 @@ def test_sycl_timer(): m1.copy_from_device(m2) # host operation [x**2 for x in range(128 * 1024)] - host_dt, device_dt = timer.dt + elapsed = timer.dt + host_dt, device_dt = elapsed + assert isinstance(repr(elapsed), str) + assert isinstance(str(elapsed), str) + assert host_dt == elapsed.host_dt + assert device_dt == elapsed.device_dt assert host_dt > device_dt or (host_dt > 0 and device_dt >= 0) q_no_profiling = dpctl.SyclQueue() assert q_no_profiling.has_enable_profiling is False diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index d15f5c8e2b..697af32f5c 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -114,7 +114,7 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor): ) -def test_async_submit(): +def test_submit_async(): try: q = dpctl.SyclQueue("opencl") except dpctl.SyclQueueCreationError: @@ -182,7 +182,7 @@ def test_async_submit(): async_detected = False for attempt in range(5): - e1 = q.submit( + e1 = q.submit_async( kern1Kernel, [ first_row, @@ -192,7 +192,7 @@ def test_async_submit(): n, ], ) - e2 = q.submit( + e2 = q.submit_async( kern2Kernel, [ second_row, @@ -202,7 +202,7 @@ def test_async_submit(): n, ], ) - e3 = q.submit( + e3 = q.submit_async( kern3Kernel, [third_row, first_row, second_row], [ @@ -214,6 +214,9 @@ def test_async_submit(): e3_st = e3.execution_status e2_st = e2.execution_status e1_st = e1.execution_status + ht_e = 
q._submit_keep_args_alive( + [first_row, second_row, third_row], [e1, e2, e3] + ) are_complete = [ e == status_complete for e in ( @@ -223,6 +226,7 @@ def test_async_submit(): ) ] e3.wait() + ht_e.wait() if not all(are_complete): async_detected = True break diff --git a/dpctl/tests/test_sycl_queue_memcpy.py b/dpctl/tests/test_sycl_queue_memcpy.py index 45c8e41f61..e678b73f03 100644 --- a/dpctl/tests/test_sycl_queue_memcpy.py +++ b/dpctl/tests/test_sycl_queue_memcpy.py @@ -44,7 +44,77 @@ def test_memcpy_copy_usm_to_usm(): q.memcpy(mobj2, mobj1, 3) - assert mv2[:3], b"123" + assert mv2[:3] == b"123" + + +def test_memcpy_copy_host_to_usm(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + + canary = bytearray(b"123456789") + host_obj = memoryview(canary) + + q.memcpy(usm_obj, host_obj, len(canary)) + + mv2 = memoryview(usm_obj) + + assert mv2[: len(canary)] == canary + + +def test_memcpy_copy_usm_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + mv2 = memoryview(usm_obj) + + n = 9 + for id in range(n): + mv2[id] = ord("a") + id + + host_obj = bytearray(b" " * n) + + q.memcpy(host_obj, usm_obj, n) + + assert host_obj == b"abcdefghi" + + +def test_memcpy_copy_host_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + dst_buf = bytearray(len(src_buf)) + + q.memcpy(dst_buf, src_buf, len(src_buf)) + + assert dst_buf == src_buf + + +def test_memcpy_async(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + n = len(src_buf) + dst_buf = bytearray(n) + dst_buf2 = bytearray(n) + + e = q.memcpy_async(dst_buf, src_buf, n) + e2 = q.memcpy_async(dst_buf2, src_buf, n, [e]) + + e.wait() + e2.wait() + assert dst_buf == src_buf + assert dst_buf2 == src_buf def test_memcpy_type_error(): @@ -56,8 +126,8 @@ def test_memcpy_type_error(): with pytest.raises(TypeError) as cm: q.memcpy(None, mobj, 3) - assert "`dest`" in str(cm.value) + assert "_Memory" in str(cm.value) with pytest.raises(TypeError) as cm: q.memcpy(mobj, None, 3) - assert "`src`" in str(cm.value) + assert "_Memory" in str(cm.value) diff --git a/dpctl/tests/test_tensor_array_api_inspection.py b/dpctl/tests/test_tensor_array_api_inspection.py new file mode 100644 index 0000000000..5ae0d35f8e --- /dev/null +++ b/dpctl/tests/test_tensor_array_api_inspection.py @@ -0,0 +1,163 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
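+
+# Note: dpt.__array_namespace_info__() implements the array API standard's
+# inspection utilities. The tests below cover capabilities(),
+# default_device(), default_dtypes(), devices() and dtypes(), including
+# filtering dtypes() by a kind string or by a tuple of kind strings.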
+
+import pytest
+
+import dpctl
+import dpctl.tensor as dpt
+from dpctl.tensor._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+_dtypes_no_fp16_fp64 = {
+    "bool": dpt.bool,
+    "float32": dpt.float32,
+    "complex64": dpt.complex64,
+    "complex128": dpt.complex128,
+    "int8": dpt.int8,
+    "int16": dpt.int16,
+    "int32": dpt.int32,
+    "int64": dpt.int64,
+    "uint8": dpt.uint8,
+    "uint16": dpt.uint16,
+    "uint32": dpt.uint32,
+    "uint64": dpt.uint64,
+}
+
+
+class MockDevice:
+    def __init__(self, fp16: bool, fp64: bool):
+        self.has_aspect_fp16 = fp16
+        self.has_aspect_fp64 = fp64
+
+
+def test_array_api_inspection_methods():
+    info = dpt.__array_namespace_info__()
+    assert info.capabilities()
+    assert info.default_device()
+    assert info.default_dtypes()
+    assert info.devices()
+    assert info.dtypes()
+
+
+def test_array_api_inspection_default_device():
+    assert (
+        dpt.__array_namespace_info__().default_device()
+        == dpctl.select_default_device()
+    )
+
+
+def test_array_api_inspection_devices():
+    devices1 = dpt.__array_namespace_info__().devices()
+    devices2 = dpctl.get_devices()
+    assert len(devices1) == len(devices2)
+    assert devices1 == devices2
+
+
+def test_array_api_inspection_capabilities():
+    capabilities = dpt.__array_namespace_info__().capabilities()
+    assert capabilities["boolean_indexing"]
+    assert capabilities["data_dependent_shapes"]
+
+
+def test_array_api_inspection_default_dtypes():
+    dev = dpctl.select_default_device()
+
+    int_dt = default_device_int_type(dev)
+    ind_dt = default_device_index_type(dev)
+    fp_dt = default_device_fp_type(dev)
+    cm_dt = default_device_complex_type(dev)
+
+    info = dpt.__array_namespace_info__()
+    default_dts_nodev = info.default_dtypes()
+    default_dts_dev = info.default_dtypes(dev)
+
+    assert (
+        int_dt == default_dts_nodev["integral"] == default_dts_dev["integral"]
+    )
+    assert (
+        ind_dt == default_dts_nodev["indexing"] == default_dts_dev["indexing"]
+    )
+    assert (
+        fp_dt
+        == default_dts_nodev["real floating"]
+        == default_dts_dev["real floating"]
+    )
+    assert (
+        cm_dt
+        == default_dts_nodev["complex floating"]
+        == default_dts_dev["complex floating"]
+    )
+
+
+def test_array_api_inspection_default_device_dtypes():
+    dev = dpctl.select_default_device()
+    dtypes = _dtypes_no_fp16_fp64.copy()
+    if dev.has_aspect_fp64:
+        dtypes["float64"] = dpt.float64
+
+    assert dtypes == dpt.__array_namespace_info__().dtypes()
+
+
+@pytest.mark.parametrize("fp16", [True, False])
+@pytest.mark.parametrize("fp64", [True, False])
+def test_array_api_inspection_device_dtypes(fp16, fp64):
+    dev = MockDevice(fp16, fp64)
+    dtypes = _dtypes_no_fp16_fp64.copy()
+    if fp64:
+        dtypes["float64"] = dpt.float64
+
+    assert dtypes == dpt.__array_namespace_info__().dtypes(device=dev)
+
+
+def test_array_api_inspection_dtype_kind():
+    info = dpt.__array_namespace_info__()
+
+    f_dtypes = info.dtypes(kind="real floating")
+    assert all([_dt[1].kind == "f" for _dt in f_dtypes.items()])
+
+    i_dtypes = info.dtypes(kind="signed integer")
+    assert all([_dt[1].kind == "i" for _dt in i_dtypes.items()])
+
+    u_dtypes = info.dtypes(kind="unsigned integer")
+    assert all([_dt[1].kind == "u" for _dt in u_dtypes.items()])
+
+    ui_dtypes = info.dtypes(kind="integral")
+    assert all([_dt[1].kind in "ui" for _dt in ui_dtypes.items()])
+
+    c_dtypes = info.dtypes(kind="complex floating")
+    assert all([_dt[1].kind == "c" for _dt in c_dtypes.items()])
+
+    assert info.dtypes(kind="bool") == {"bool": dpt.bool}
+
+
_signed_ints = { + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + } + assert ( + info.dtypes(kind=("signed integer", "signed integer")) == _signed_ints + ) + assert ( + info.dtypes( + kind=("integral", "bool", "real floating", "complex floating") + ) + == info.dtypes() + ) diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py new file mode 100644 index 0000000000..7050b17e7c --- /dev/null +++ b/dpctl/tests/test_tensor_clip.py @@ -0,0 +1,627 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from helper import get_queue_or_skip, skip_if_dtype_not_supported +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast +from dpctl.utils import ExecutionPlacementError + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_usm_types = ["device", "shared", "host"] + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +def test_clip_dtypes(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q) + ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # also covers cases where dt1 == dt2 + if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64): + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=ar3, max=None) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=None, max=ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + else: + with pytest.raises(ValueError): + dpt.clip(ar1, ar2, ar3) + with pytest.raises(ValueError): + dpt.clip(ar1, min=ar3, max=None) + with pytest.raises(ValueError): + dpt.clip(ar1, min=None, max=ar3) + + +def test_clip_empty(): + get_queue_or_skip() + + x = dpt.empty((2, 0, 3), dtype="i4") + a_min = dpt.ones((2, 0, 3), dtype="i4") + a_max = dpt.ones((2, 0, 3), dtype="i4") + + r = dpt.clip(x, a_min, a_max) + assert r.size == 0 + assert r.shape == x.shape + + +def test_clip_python_scalars(): + get_queue_or_skip() + + arrs = [ + dpt.ones(1, dtype="?"), + dpt.ones(1, dtype="i4"), + dpt.ones(1, dtype="f4"), + dpt.ones(1, dtype="c8"), + ] + + py_zeros = [ + False, + 0, + 0.0, + complex(0, 0), + ] + + py_ones = [ + True, + 1, + 1.0, + complex(1, 0), + ] + 
+ for zero, one, arr in zip(py_zeros, py_ones, arrs): + r = dpt.clip(arr, zero, one) + assert isinstance(r, dpt.usm_ndarray) + r = dpt.clip(arr, min=zero) + assert isinstance(r, dpt.usm_ndarray) + + +def test_clip_in_place(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + a_min = dpt.arange(1, 11, dtype="i4") + a_max = dpt.arange(2, 12, dtype="i4") + dpt.clip(x, a_min, a_max, out=x) + assert dpt.all(x == a_min) + + x = dpt.arange(10, dtype="i4") + dpt.clip(x, min=a_min, max=None, out=x) + assert dpt.all(x == a_min) + + x = dpt.arange(10, dtype="i4") + dpt.clip(x, a_min, a_max, out=a_max) + assert dpt.all(a_max == a_min) + + a_min = dpt.arange(1, 11, dtype="i4") + dpt.clip(x, min=a_min, max=None, out=a_min[::-1]) + assert dpt.all((x + 1)[::-1] == a_min) + + +def test_clip_special_cases(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="f4") + r = dpt.clip(x, -dpt.inf, dpt.inf) + assert dpt.all(r == x) + r = dpt.clip(x, dpt.nan, dpt.inf) + assert dpt.all(dpt.isnan(r)) + r = dpt.clip(x, -dpt.inf, dpt.nan) + assert dpt.all(dpt.isnan(r)) + + +def test_clip_out_need_temporary(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i4") + a_max = dpt.asarray(3, dtype="i4") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i4") + a_max = dpt.asarray(3, dtype="i2") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i2") + a_max = dpt.asarray(3, dtype="i4") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i2") + a_max = dpt.asarray(3, dtype="i1") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.full(6, 3, dtype="i4") + a_min = dpt.full(10, 2, dtype="i4") + a_max = dpt.asarray(4, dtype="i4") + dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + x = dpt.full(6, 3, dtype="i4") + a_min = dpt.full(10, 2, dtype="i4") + a_max = dpt.asarray(4, dtype="i2") + dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + +def test_clip_out_need_temporary_none(): + get_queue_or_skip() + + x = dpt.full(6, 3, dtype="i4") + # with min/max == None + a_min = dpt.full(10, 2, dtype="i4") + dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + +def test_clip_arg_validation(): + get_queue_or_skip() + + check = dict() + x1 = dpt.empty((1,), dtype="i4") + x2 = dpt.empty((1,), dtype="i4") + + with pytest.raises(TypeError): + dpt.clip(check, x1, x2) + + +@pytest.mark.parametrize( + "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")] +) +def test_clip_order(dt1, dt2): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt1, order="C") + ar3 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") 
+ assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt1, order="F") + ar3 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("dt", ["i4", "i2"]) +def test_clip_none_order(dt): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt, order="C") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt, order="F") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2] + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +@pytest.mark.parametrize("usm_type3", _usm_types) +def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3) + + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (usm_type1, usm_type2, usm_type3) + ) + assert r.usm_type == expected_usm_type + + +@pytest.mark.parametrize("usm_type1", _usm_types) 
+@pytest.mark.parametrize("usm_type2", _usm_types) +def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + + r = dpt.clip(ar1, min=ar2, max=None) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2)) + assert r.usm_type == expected_usm_type + + +def test_clip_dtype_error(): + get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4") + ar2 = dpt.ones(1, dtype="i4") + ar3 = dpt.ones(1, dtype="i4") + ar4 = dpt.empty_like(ar1, dtype="f4") + + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + None, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + ar2, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + 1, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + 1, + ar4, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + None, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="float32") + ar3 = dpt.ones_like(ar1, dtype="float32") + ar4 = dpt.empty(3, dtype="float32") + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + ar1 = np.ones(2, dtype="f4") + ar2 = dpt.ones(2, dtype="f4") + ar3 = dpt.ones(2, dtype="f4") + assert_raises_regex( + TypeError, + "Expected `x` to be of dpctl.tensor.usm_ndarray type*", + dpt.clip, + ar1, + ar2, + ar3, + ) + + ar1 = dpt.ones(2, dtype="i4") + ar2 = dpt.ones_like(ar1, dtype="i4") + ar3 = dpt.ones_like(ar1, dtype="i4") + ar4 = np.empty_like(ar1) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def 
test_clip_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + x3 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.clip(x1, x2, x3, out=out) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q) + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q) + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_strided(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 2 * 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2] + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + expected = expected[::-2] + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2] + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + a_max = a_max[::-2] + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +def test_clip_max_less_than_min(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.clip(x, 5, 0) + assert dpt.all(res == 0) + + +def test_clip_minmax_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype=dpt.bool) + min_list = [False, 0, 0.0, 0.0 + 0.0j] + max_list = [True, 1, 1.0, 1.0 + 0.0j] + for min_v, max_v in zip(min_list, max_list): + if isinstance(min_v, bool) and isinstance(max_v, bool): + y = dpt.clip(x, min_v, max_v) + assert isinstance(y, dpt.usm_ndarray) + else: + with pytest.raises(ValueError): + dpt.clip(x, min_v, max_v) + + +def test_clip_max_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype="i4") + m = dpt.ones(10, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, m, 2.5) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5, m) diff --git a/dpctl/tests/test_tensor_statistical_functions.py b/dpctl/tests/test_tensor_statistical_functions.py new file mode 100644 index 0000000000..8916833f86 --- /dev/null +++ b/dpctl/tests/test_tensor_statistical_functions.py @@ -0,0 +1,254 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
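+
+# Note: following array API conventions, mean/std/var of boolean and integer
+# input produce the device's default floating-point type, and std/var accept
+# a ``correction`` term so that the divisor is N - correction (correction=0
+# gives the population statistic, correction=1 Bessel's correction). When
+# correction equals the reduced axis length the divisor is zero and the
+# result is NaN, which test_var_axis_length_correction relies on.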
+ +import pytest + +import dpctl.tensor as dpt +from dpctl.tensor._tensor_impl import default_device_fp_type +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + + +@pytest.mark.parametrize("dt", _no_complex_dtypes) +def test_mean_dtypes(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.ones(10, dtype=dt) + res = dpt.mean(x) + assert res == 1 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + +@pytest.mark.parametrize("dt", _no_complex_dtypes) +@pytest.mark.parametrize("py_zero", [float(0), int(0)]) +def test_std_var_dtypes(dt, py_zero): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.ones(10, dtype=dt) + res = dpt.std(x, correction=py_zero) + assert res == 0 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + res = dpt.var(x, correction=py_zero) + assert res == 0 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + +def test_stat_fns_axis(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + m = dpt.mean(x, axis=(1, 2, -1)) + + assert isinstance(m, dpt.usm_ndarray) + assert m.shape == (3, 6) + assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype)) + + s = dpt.var(x, axis=(1, 2, -1)) + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype)) + + +@pytest.mark.parametrize("fn", [dpt.mean, dpt.var]) +def test_stat_fns_empty(fn): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + r = fn(x) + assert r.shape == tuple() + assert dpt.isnan(r) + + x = dpt.empty((10, 0, 2), dtype="f4") + r = fn(x, axis=1) + assert r.shape == (10, 2) + assert dpt.all(dpt.isnan(r)) + + r = fn(x, axis=0) + assert r.shape == (0, 2) + assert r.size == 0 + + +def test_stat_fns_keepdims(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + m = dpt.mean(x, axis=(1, 2, -1), keepdims=True) + + assert isinstance(m, dpt.usm_ndarray) + assert m.shape == (3, 1, 1, 6, 1) + assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype)) + + s = dpt.var(x, axis=(1, 2, -1), keepdims=True) + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype)) + + +def test_stat_fns_empty_axis(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + m = dpt.mean(x, axis=()) + + assert x.shape == m.shape + assert dpt.all(x == m) + + s = dpt.var(x, axis=()) + assert x.shape == s.shape + assert dpt.all(s == 0) + + d = dpt.std(x, axis=()) + assert x.shape == d.shape + assert dpt.all(d == 0) + + +def test_mean(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + m = dpt.mean(x) + expected = dpt.asarray(4, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=0) + expected = dpt.arange(3, 6, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=1) + expected = dpt.asarray([1, 4, 7], dtype="f4") + assert dpt.allclose(m, expected) + + +def test_var_std(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + r = dpt.var(x) + expected = dpt.asarray(6.666666507720947, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, 
correction=3) + expected1 = dpt.asarray(10.0, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, correction=3) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=0) + expected = dpt.full(x.shape[1], 6, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=0, correction=1) + expected1 = dpt.full(x.shape[1], 9, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=0) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=0, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=1) + expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=1, correction=1) + expected1 = dpt.ones(x.shape[0], dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=1) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=1, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + +def test_var_axis_length_correction(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + + r = dpt.var(x, correction=x.size) + assert dpt.isnan(r) + + r = dpt.var(x, axis=0, correction=x.shape[0]) + assert dpt.all(dpt.isnan(r)) + + r = dpt.var(x, axis=1, correction=x.shape[1]) + assert dpt.all(dpt.isnan(r)) + + +def test_stat_function_errors(): + d = dict() + with pytest.raises(TypeError): + dpt.var(d) + with pytest.raises(TypeError): + dpt.std(d) + with pytest.raises(TypeError): + dpt.mean(d) + + x = dpt.empty(1, dtype="f4") + with pytest.raises(TypeError): + dpt.var(x, axis=d) + with pytest.raises(TypeError): + dpt.std(x, axis=d) + with pytest.raises(TypeError): + dpt.mean(x, axis=d) + + with pytest.raises(TypeError): + dpt.var(x, correction=d) + with pytest.raises(TypeError): + dpt.std(x, correction=d) + + x = dpt.empty(1, dtype="c8") + with pytest.raises(ValueError): + dpt.var(x) + with pytest.raises(ValueError): + dpt.std(x) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 403a823324..33fe4a8b4f 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
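For reference, the `correction` semantics pinned down by `test_var_std` above follow the array API convention: for an input with N elements, `dpt.var` divides the sum of squared deviations by `N - correction` (with `correction=0` by default), and `dpt.std` is its square root. A minimal sketch reproducing the tested values:

```python
import dpctl.tensor as dpt

x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))

# mean(x) == 4 and sum((x - 4)**2) == 60 for arange(9)
print(float(dpt.var(x)))                # 60 / 9  ~= 6.6667
print(float(dpt.var(x, correction=3)))  # 60 / (9 - 3) == 10.0
print(float(dpt.std(x, correction=3)))  # sqrt(10) ~= 3.1623
```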
-import numpy as np import pytest import dpctl.tensor as dpt +import dpctl.utils as du from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported _all_dtypes = [ @@ -36,7 +36,6 @@ "c8", "c16", ] -_usm_types = ["device", "shared", "host"] @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -44,6 +43,7 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) + # test reduction for C-contiguous input m = dpt.ones(100, dtype=arg_dtype) r = dpt.sum(m) @@ -56,11 +56,19 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): assert r.dtype.kind == "f" elif m.dtype.kind == "c": assert r.dtype.kind == "c" - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) + + # test reduction for strided input m = dpt.ones(200, dtype=arg_dtype)[:1:-2] r = dpt.sum(m) - assert (dpt.asnumpy(r) == 99).all() + assert dpt.all(r == 99) + + # test reduction for strided input which can be simplified + # to contiguous computation + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(dpt.flip(m)) + assert dpt.all(r == 100) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -75,7 +83,7 @@ def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) def test_sum_empty(): @@ -94,7 +102,7 @@ def test_sum_axis(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 6) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype="i4")) def test_sum_keepdims(): @@ -105,7 +113,7 @@ def test_sum_keepdims(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 1, 1, 6, 1) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype)) def test_sum_scalar(): @@ -117,7 +125,7 @@ def test_sum_scalar(): assert isinstance(s, dpt.usm_ndarray) assert m.sycl_queue == s.sycl_queue assert s.shape == () - assert dpt.asnumpy(s) == np.full((), 1) + assert s == dpt.full((), 1) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -132,7 +140,7 @@ def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert dpt.asnumpy(r) == 1 + assert r == 1 def test_sum_keepdims_zero_size(): @@ -174,6 +182,21 @@ def test_largish_reduction(arg_dtype, n): assert dpt.all(dpt.equal(y1, n * m)) +@pytest.mark.parametrize("n", [1023, 1024, 1025]) +def test_largish_reduction_axis1_axis0(n): + get_queue_or_skip() + + m = 25 + x1 = dpt.ones((m, n), dtype="f4") + x2 = dpt.ones((n, m), dtype="f4") + + y1 = dpt.sum(x1, axis=1) + y2 = dpt.sum(x2, axis=0) + + assert dpt.all(y1 == n) + assert dpt.all(y2 == n) + + def test_axis0_bug(): "gh-1391" get_queue_or_skip() @@ -187,3 +210,131 @@ def test_axis0_bug(): expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) assert dpt.all(s == expected) + + +def test_sum_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + # The atomic case is checked in `test_usm_ndarray_reductions` + # This test checks the tree reduction path for correctness + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.sum(x, axis=0) + expected = dpt.asarray( + [ + [60, 63, 66, 69, 72], + [75, 78, 81, 84, 87], + [90, 93, 96, 99, 102], + [105, 108, 111, 114, 117], + ], + dtype="f4", + ) + tol = dpt.finfo(m.dtype).resolution + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + x 
= dpt.flip(x, axis=2)
+    m = dpt.sum(x, axis=2)
+    expected = dpt.asarray(
+        [[10, 35, 60, 85], [110, 135, 160, 185], [210, 235, 260, 285]],
+        dtype="f4",
+    )
+    assert dpt.allclose(m, expected, atol=tol, rtol=tol)
+
+
+def _any_complex(dtypes):
+    return any(dpt.isdtype(dpt.dtype(dt), "complex floating") for dt in dtypes)
+
+
+def _skip_on_this_device(sycl_dev):
+    device_mask = du.intel_device_info(sycl_dev).get("device_id", 0) & 0xFF00
+    return device_mask in [0x3E00, 0x9B00]
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:])
+def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    arg_dtype = dpt.dtype(arg_dtype)
+    if _any_complex((arg_dtype,)):
+        if _skip_on_this_device(q.sycl_device):
+            pytest.skip(
+                "Product reductions for complex output are known "
+                "to fail for Gen9 with 2024.0 compiler"
+            )
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.prod(m)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    if m.dtype.kind == "i":
+        assert r.dtype.kind == "i"
+    elif m.dtype.kind == "u":
+        assert r.dtype.kind == "u"
+    elif m.dtype.kind == "f":
+        assert r.dtype.kind == "f"
+    elif m.dtype.kind == "c":
+        assert r.dtype.kind == "c"
+    assert dpt.all(r == 1)
+
+    if dpt.isdtype(m.dtype, "unsigned integer"):
+        m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2]
+        r = dpt.prod(m)
+        assert dpt.all(r == dpt.asarray(512, dtype=r.dtype))
+    else:
+        m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2]
+        r = dpt.prod(m)
+        assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype))
+
+
+def test_prod_empty():
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="u1")
+    y = dpt.prod(x)
+    assert y.shape == tuple()
+    assert int(y) == 1
+
+
+def test_prod_axis():
+    get_queue_or_skip()
+
+    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
+    s = dpt.prod(m, axis=(1, 2, -1))
+
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.all(s == dpt.asarray(1, dtype="i4"))
+
+
+@pytest.mark.parametrize("arg_dtype", _all_dtypes)
+@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
+def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    out_dtype = dpt.dtype(out_dtype)
+    arg_dtype = dpt.dtype(arg_dtype)
+    if _any_complex((arg_dtype, out_dtype)):
+        if _skip_on_this_device(q.sycl_device):
+            pytest.skip(
+                "Product reductions for complex output are known "
+                "to fail for Gen9 with 2024.0 compiler"
+            )
+
+    m = dpt.ones(100, dtype=arg_dtype)
+    r = dpt.prod(m, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    assert dpt.all(r == 1)
+
+
+def test_gh_1468():
+    "See https://github.com/IntelPython/dpctl/issues/1468"
+    get_queue_or_skip()
+
+    a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32)
+    t = dpt.sum(a, dtype="f4")
+    assert t > 0
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index 72f5aabebb..095bbc5638 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -39,6 +39,7 @@
         (2, 5, 2),
         (2, 2, 2, 2, 2, 2, 2, 2),
         5,
+        np.int32(7),
     ],
 )
 @pytest.mark.parametrize("usm_type", ["shared", "host", "device"])
diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py
index 2126727d5b..f3704274d4 100644
--- a/dpctl/tests/test_usm_ndarray_manipulation.py
+++ b/dpctl/tests/test_usm_ndarray_manipulation.py
@@ -1170,6 +1170,12 @@ def test_repeat_axes():
    res = 
dpt.repeat(x, reps, axis=1) assert dpt.all(res == expected_res) + x = dpt.arange(10, dtype="i4") + expected_res = dpt.empty(x.shape[0] * reps, x.dtype) + expected_res[::2], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + def test_repeat_size_0_outputs(): get_queue_or_skip() @@ -1193,11 +1199,17 @@ def test_repeat_size_0_outputs(): assert res.size == 0 assert res.shape == (3, 0, 5) - x = dpt.ones((3, 2, 5)) res = dpt.repeat(x, (0, 0), axis=1) assert res.size == 0 assert res.shape == (3, 0, 5) + # axis=None cases + res = dpt.repeat(x, 0) + assert res.size == 0 + + res = dpt.repeat(x, (0,) * x.size) + assert res.size == 0 + def test_repeat_strides(): get_queue_or_skip() @@ -1220,6 +1232,17 @@ def test_repeat_strides(): res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0) assert dpt.all(res == expected_res) + # axis=None + x = dpt.reshape(dpt.arange(10 * 10), (10, 10)) + x1 = dpt.reshape(x[::-2, :], -1) + x2 = x[::-2, :] + expected_res = dpt.empty(10 * 10, dtype="i4") + expected_res[::2], expected_res[1::2] = x1, x1 + res = dpt.repeat(x2, reps) + assert dpt.all(res == expected_res) + res = dpt.repeat(x2, (reps,) * x1.size) + assert dpt.all(res == expected_res) + def test_repeat_casting(): get_queue_or_skip() @@ -1256,11 +1279,6 @@ def test_repeat_arg_validation(): with pytest.raises(ValueError): dpt.repeat(x, 2, axis=1) - # x.ndim cannot be > 1 for axis=None - x = dpt.empty((5, 10)) - with pytest.raises(ValueError): - dpt.repeat(x, 2, axis=None) - # repeats must be positive x = dpt.empty(5) with pytest.raises(ValueError): diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py new file mode 100644 index 0000000000..0969822e6d --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -0,0 +1,499 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
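The manipulation-test changes above remove the old `ValueError` for `axis=None` with multi-dimensional input: `dpt.repeat` now flattens the array first, mirroring `numpy.repeat`. A small sketch of the behavior the new tests rely on:

```python
import dpctl.tensor as dpt

x = dpt.asarray([[1, 2], [3, 4]], dtype="i4")

# axis=None (the default) flattens the input before repeating
r = dpt.repeat(x, 2)
print(dpt.asnumpy(r))  # [1 1 2 2 3 3 4 4]

# a tuple of per-element counts must match the flattened size
r = dpt.repeat(x, (1, 2, 3, 0))
print(dpt.asnumpy(r))  # [1 2 2 3 3 3]
```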
+ +from random import randrange + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + + +_all_dtypes = _no_complex_dtypes + [ + "c8", + "c16", +] + + +def test_max_min_axis(): + get_queue_or_skip() + + x = dpt.reshape( + dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7) + ) + + m = dpt.max(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, -1, -1, :, -1]) + + m = dpt.min(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, 0, 0, :, 0]) + + +def test_max_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5), (3, 4, 5)) + + m = dpt.max(x, axis=0) + assert dpt.all(m == x[-1, :, :]) + + x = dpt.flip(x, axis=2) + m = dpt.max(x, axis=2) + assert dpt.all(m == x[:, :, 0]) + + +def test_reduction_keepdims(): + get_queue_or_skip() + + n0, n1 = 3, 6 + x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4") + m = dpt.max(x, axis=(1, 2, -1), keepdims=True) + + xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1)) + p = dpt.argmax(xx, axis=-1, keepdims=True) + + assert m.shape == (n0, 1, 1, n1, 1) + assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + assert dpt.all(p == 0) + + +def test_max_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.max(x) + + assert m.shape == () + assert x == m + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 3 + x[:, x.shape[1] // 2] = 3 + + m = dpt.max(x) + assert m == 3 + m = dpt.max(x, axis=0) + assert dpt.all(m == 3) + m = dpt.max(x, axis=1) + assert dpt.all(m == 3) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 0 + x[:, x.shape[1] // 2] = 0 + + m = dpt.min(x) + assert m == 0 + m = dpt.min(x, axis=0) + assert dpt.all(m == 0) + m = dpt.min(x, axis=1) + assert dpt.all(m == 0) + + +def test_max_min_nan_propagation(): + get_queue_or_skip() + + # float, finites + x = dpt.arange(4, dtype="f4") + x[0] = dpt.nan + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + # float, infinities + x[1:] = dpt.inf + assert dpt.isnan(dpt.max(x)) + x[1:] = -dpt.inf + assert dpt.isnan(dpt.min(x)) + + # complex + x = dpt.arange(4, dtype="c8") + x[0] = complex(dpt.nan, 0) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + x[0] = complex(0, dpt.nan) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + +def test_argmax_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.argmax(x) + + assert m.shape == () + assert m == 0 + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_search_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x_shape = (24, 1024) + x_size = np.prod(x_shape) + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = 
randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 2 + + m = dpt.argmax(x) + assert m == idx + + # test case of strided input mapping to contig + # implementation + m = dpt.argmax(dpt.flip(x)) + assert m == x.size - 1 - idx + + # test case of strided implementation + y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q) + y[::2] = x + m = dpt.argmax(y) + assert m == 2 * idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = 3 + m = dpt.argmax(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = 4 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = 5 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx) + + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 0 + + m = dpt.argmin(x) + assert m == idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = -1 + m = dpt.argmin(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = -2 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = -3 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx) + + +def test_argmax_argmin_nan_propagation(): + get_queue_or_skip() + + sz = 4 + idx = randrange(sz) + # floats + x = dpt.arange(sz, dtype="f4") + x[idx] = dpt.nan + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + # complex + x = dpt.arange(sz, dtype="c8") + x[idx] = complex(dpt.nan, 0) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + x[idx] = complex(0, dpt.nan) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + +def test_argmax_argmin_identities(): + # make sure that identity arrays work as expected + get_queue_or_skip() + + x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4") + assert dpt.argmax(x) == 0 + x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") + assert dpt.argmin(x) == 0 + + +@pytest.mark.parametrize("order", ["C", "F"]) +def test_argmax_axis0_axis1(order): + get_queue_or_skip() + + x = dpt.asarray([[1, 2, 3], [6, 5, 4]], dtype="i4", order=order) + assert dpt.argmax(x) == 3 + + res = dpt.argmax(x, axis=0) + expected = dpt.asarray([1, 1, 1], dtype=res.dtype) + assert dpt.all(res == expected) + + res = dpt.argmax(x, axis=1) + expected = dpt.asarray([2, 0], dtype=res.dtype) + assert dpt.all(res == expected) + + +def test_reduction_arg_validation(): + get_queue_or_skip() + + x = dict() + with pytest.raises(TypeError): + dpt.sum(x) + with pytest.raises(TypeError): + dpt.max(x) + with pytest.raises(TypeError): + dpt.argmax(x) + + x = dpt.zeros((0,), dtype="i4") + with pytest.raises(ValueError): + dpt.max(x) + with pytest.raises(ValueError): + dpt.argmax(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_logsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.logaddexp.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_logsumexp_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.logsumexp(x) + assert y.shape == tuple() + assert y == -dpt.inf + + +def test_logsumexp_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + s = dpt.logsumexp(m, 
axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + tol = dpt.finfo(s.dtype).resolution + assert_allclose( + dpt.asnumpy(s), + np.logaddexp.reduce(dpt.asnumpy(m), axis=(1, 2, -1), dtype=s.dtype), + rtol=tol, + atol=tol, + ) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_logsumexp_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_logsumexp_keepdims(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.logsumexp(m, axis=(1, 2, -1), keepdims=True) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + + +def test_logsumexp_keepdims_zero_size(): + get_queue_or_skip() + n = 10 + a = dpt.ones((n, 0, n)) + + s1 = dpt.logsumexp(a, keepdims=True) + assert s1.shape == (1, 1, 1) + + s2 = dpt.logsumexp(a, axis=(0, 1), keepdims=True) + assert s2.shape == (1, 1, n) + + s3 = dpt.logsumexp(a, axis=(1, 2), keepdims=True) + assert s3.shape == (n, 1, 1) + + s4 = dpt.logsumexp(a, axis=(0, 2), keepdims=True) + assert s4.shape == (1, 0, 1) + + a0 = a[0] + s5 = dpt.logsumexp(a0, keepdims=True) + assert s5.shape == (1, 1) + + +def test_logsumexp_scalar(): + get_queue_or_skip() + + m = dpt.ones(()) + s = dpt.logsumexp(m) + + assert isinstance(s, dpt.usm_ndarray) + assert m.sycl_queue == s.sycl_queue + assert s.shape == () + + +def test_logsumexp_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +def test_logsumexp_int_axis(): + get_queue_or_skip() + + x = dpt.zeros((8, 10), dtype="f4") + res = dpt.logsumexp(x, axis=0) + assert res.ndim == 1 + assert res.shape[0] == 10 + + +def test_logsumexp_invalid_arr(): + x = dict() + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_hypot_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.hypot.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_hypot_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.reduce_hypot(x) + assert y.shape == tuple() + assert y == 0 + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_hypot_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_hypot_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(TypeError): + dpt.reduce_hypot(x) + + +def test_tree_reduction_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.logsumexp(x, axis=0) + tol = 
dpt.finfo(m.dtype).resolution + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=0, dtype=m.dtype), + rtol=tol, + atol=tol, + ) + + x = dpt.flip(x, axis=2) + m = dpt.logsumexp(x, axis=2) + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=2, dtype=m.dtype), + rtol=tol, + atol=tol, + ) diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py index df4a9f503f..05b2dc7890 100644 --- a/dpctl/tests/test_utils.py +++ b/dpctl/tests/test_utils.py @@ -122,3 +122,27 @@ def test_onetrace_enabled(): with dpctl.utils.onetrace_enabled(): assert os.getenv(v_name, None) == "1" assert os.getenv(v_name, None) == v_v + + +def test_intel_device_info(): + try: + d = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device could not be created") + descr = dpctl.utils.intel_device_info(d) + assert isinstance(descr, dict) + assert ("device_id" in descr) or not descr + allowed_names = [ + "device_id", + "gpu_slices", + "gpu_eu_count", + "gpu_eu_simd_width", + "gpu_hw_threads_per_eu", + "gpu_subslices_per_slice", + "gpu_eu_count_per_subslice", + "max_mem_bandwidth", + ] + for descriptor_name in descr.keys(): + test = descriptor_name in allowed_names + err_msg = f"Key '{descriptor_name}' is not recognized" + assert test, err_msg diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt index 11b0930052..aadc1c0fe0 100644 --- a/dpctl/utils/CMakeLists.txt +++ b/dpctl/utils/CMakeLists.txt @@ -4,3 +4,26 @@ foreach(_cy_file ${_cython_sources}) get_filename_component(_trgt ${_cy_file} NAME_WLE) build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/utils") endforeach() + +add_custom_target(_dpctl4pybind11_header_ready + DEPENDS + _usmarray_copy_capi_include + _memory_copy_capi_include + _sycl_device_copy_capi_include + _sycl_queue_copy_capi_include + _sycl_context_copy_capi_include + _sycl_event_copy_capi_include +) + +set(python_module_name _device_queries) +set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/src/device_queries.cpp) +pybind11_add_module(${python_module_name} MODULE + ${_module_src} +) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include +) +add_dependencies(${python_module_name} _dpctl4pybind11_header_ready) +install(TARGETS ${python_module_name} DESTINATION "dpctl/utils") diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index 671564cda5..fb41b3b74c 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -18,18 +18,85 @@ A collection of utility functions. """ +from .._sycl_device import SyclDevice from ._compute_follows_data import ( ExecutionPlacementError, get_coerced_usm_type, get_execution_queue, validate_usm_type, ) +from ._device_queries import ( + intel_device_info_device_id, + intel_device_info_gpu_eu_count, + intel_device_info_gpu_eu_count_per_subslice, + intel_device_info_gpu_eu_simd_width, + intel_device_info_gpu_hw_threads_per_eu, + intel_device_info_gpu_slices, + intel_device_info_gpu_subslices_per_slice, + intel_device_info_max_mem_bandwidth, +) from ._onetrace_context import onetrace_enabled + +def intel_device_info(dev): + """intel_device_info(sycl_device) + + For Intel(R) GPU devices returns a dictionary + with device architectural details, and an empty + dictionary otherwise. 
The dictionary contains
+    the following keys:
+
+        device_id: 32-bit device PCI identifier
+        gpu_eu_count: Total number of execution units
+        gpu_hw_threads_per_eu: Number of thread contexts in EU
+        gpu_eu_simd_width: Physical SIMD width of EU
+        gpu_slices: Total number of slices
+        gpu_subslices_per_slice: Number of sub-slices per slice
+        gpu_eu_count_per_subslice: Number of EUs in subslice
+        max_mem_bandwidth: Maximum memory bandwidth in bytes/second
+
+    Unsupported descriptors are omitted from the dictionary.
+    Descriptors other than PCI identifier are supported only for
+    SyclDevices with Level-Zero backend.
+    """
+    if not isinstance(dev, SyclDevice):
+        raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}")
+    dev_id = intel_device_info_device_id(dev)
+    if dev_id:
+        res = {
+            "device_id": dev_id,
+        }
+        if dev.has_aspect_gpu:
+            eu_count = intel_device_info_gpu_eu_count(dev)
+            if eu_count:
+                res["gpu_eu_count"] = eu_count
+            hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev)
+            if hw_threads:
+                res["gpu_hw_threads_per_eu"] = hw_threads
+            simd_w = intel_device_info_gpu_eu_simd_width(dev)
+            if simd_w:
+                res["gpu_eu_simd_width"] = simd_w
+            n_slices = intel_device_info_gpu_slices(dev)
+            if n_slices:
+                res["gpu_slices"] = n_slices
+            n_subslices = intel_device_info_gpu_subslices_per_slice(dev)
+            if n_subslices:
+                res["gpu_subslices_per_slice"] = n_subslices
+            n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev)
+            if n_eu_per_subslice:
+                res["gpu_eu_count_per_subslice"] = n_eu_per_subslice
+        bw = intel_device_info_max_mem_bandwidth(dev)
+        if bw:
+            res["max_mem_bandwidth"] = bw
+        return res
+    return dict()
+
+
 __all__ = [
     "get_execution_queue",
     "get_coerced_usm_type",
     "validate_usm_type",
     "onetrace_enabled",
+    "intel_device_info",
     "ExecutionPlacementError",
 ]
diff --git a/dpctl/utils/src/device_queries.cpp b/dpctl/utils/src/device_queries.cpp
new file mode 100644
index 0000000000..6407e69dbb
--- /dev/null
+++ b/dpctl/utils/src/device_queries.cpp
@@ -0,0 +1,139 @@
+#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace
+{
+
+std::uint32_t py_intel_device_id(const sycl::device &d)
+{
+    static constexpr std::uint32_t device_id_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_device_id)) {
+        return d.get_info<sycl::ext::intel::info::device::device_id>();
+    }
+
+    return device_id_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_count(const sycl::device &d)
+{
+    static constexpr std::uint32_t eu_count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
+    }
+
+    return eu_count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_hw_threads_per_eu(const sycl::device &d)
+{
+    static constexpr std::uint32_t thread_count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+        return d
+            .get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
+    }
+
+    return thread_count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_simd_width(const sycl::device &d)
+{
+    static constexpr std::uint32_t width_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_simd_width)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>();
+    }
+
+    return width_unavailable;
+}
+
+std::uint32_t py_intel_gpu_slices(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_slices)) {
+        return d.get_info<sycl::ext::intel::info::device::gpu_slices>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_subslices_per_slice(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
+        return d.get_info<
+            sycl::ext::intel::info::device::gpu_subslices_per_slice>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint32_t py_intel_gpu_eu_count_per_subslice(const sycl::device &d)
+{
+    static constexpr std::uint32_t count_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice)) {
+        return d.get_info<
+            sycl::ext::intel::info::device::gpu_eu_count_per_subslice>();
+    }
+
+    return count_unavailable;
+}
+
+std::uint64_t py_intel_max_mem_bandwidth(const sycl::device &d)
+{
+    static constexpr std::uint64_t bandwidth_unavailable = 0;
+
+    if (d.has(sycl::aspect::ext_intel_max_mem_bandwidth)) {
+        return d.get_info<sycl::ext::intel::info::device::max_mem_bandwidth>();
+    }
+
+    return bandwidth_unavailable;
+}
+
+}; // namespace
+
+PYBIND11_MODULE(_device_queries, m)
+{
+    m.def("intel_device_info_device_id", &py_intel_device_id,
+          "Get ext_intel_device_id for the device, zero if not an Intel device",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_count", &py_intel_gpu_eu_count,
+          "Returns the number of execution units (EUs) associated with the "
+          "Intel GPU.",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_hw_threads_per_eu",
+          &py_intel_gpu_hw_threads_per_eu,
+          "Returns the number of hardware threads in EU.", py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_simd_width", &py_intel_gpu_eu_simd_width,
+          "Returns the physical SIMD width of the execution unit (EU).",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_slices", &py_intel_gpu_slices,
+          "Returns the number of slices in the GPU device, or zero.",
+          py::arg("device"));
+
+    m.def("intel_device_info_gpu_subslices_per_slice",
+          &py_intel_gpu_subslices_per_slice,
+          "Returns the number of subslices per slice.", py::arg("device"));
+
+    m.def("intel_device_info_gpu_eu_count_per_subslice",
+          &py_intel_gpu_eu_count_per_subslice,
+          "Returns the number of EUs per subslice of GPU.", py::arg("device"));
+
+    m.def("intel_device_info_max_mem_bandwidth", &py_intel_max_mem_bandwidth,
+          "Returns the maximum memory bandwidth in units of bytes/second.",
+          py::arg("device"));
+}
diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt
index ce231fad4a..c8679ab73a 100644
--- a/examples/pybind11/external_usm_allocation/CMakeLists.txt
+++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt
@@ -1,10 +1,11 @@
-cmake_minimum_required(VERSION 3.21)
+cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR)
 
-project(external_usm_allocation LANGUAGES CXX)
+project(external_usm_allocation VERSION 0.1 LANGUAGES CXX
+    DESCRIPTION "Example of passing external C++ USM allocation to Python")
 
 set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
-find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
+find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
@@ -13,20 +14,23 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
 include(FetchContent)
 FetchContent_Declare(
     pybind11
-    URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz
-    URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae
+    URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz
+    URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c
 )
 FetchContent_MakeAvailable(pybind11)
 
-find_package(PythonExtensions REQUIRED)
+find_package(Python REQUIRED COMPONENTS 
Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _external_usm_alloc) +set(_sources + external_usm_allocation/_usm_alloc_example.cpp +) pybind11_add_module(${py_module_name} MODULE - external_usm_allocation/_usm_alloc_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION external_usm_allocation diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt index 25589e4202..eb70b22982 100644 --- a/examples/pybind11/onemkl_gemv/CMakeLists.txt +++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt @@ -1,9 +1,10 @@ -cmake_minimum_required(VERSION 3.22 FATAL_ERROR) +cmake_minimum_required(VERSION 3.22...3.27 FATAL_ERROR) -project(example_use_mkl_gemm LANGUAGES CXX) +project(example_use_mkl_gemm VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using Python wrapper to oneMKL function") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}") @@ -17,12 +18,12 @@ include(GNUInstallDirs) include(FetchContent) FetchContent_Declare( pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz - URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz + URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) find_library(mkl_core NAMES mkl_core PATHS ${MKL_LIBRARY_DIR} REQUIRED) @@ -34,10 +35,12 @@ find_library(OpenCL NAMES OpenCL REQUIRED) set(py_module_name _onemkl) +set(_sources sycl_gemm/_onemkl.cpp) pybind11_add_module(${py_module_name} MODULE - sycl_gemm/_onemkl.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_compile_definitions(${py_module_name} PRIVATE -DMKL_ILP64) target_include_directories(${py_module_name} PUBLIC ${MKL_INCLUDE_DIR} sycl_gemm @@ -49,11 +52,14 @@ target_link_libraries(${py_module_name} install(TARGETS ${py_module_name} DESTINATION sycl_gemm) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) -get_target_property(_sycl_gemm_sources ${py_module_name} SOURCES) -set_source_files_properties(${_sycl_gemm_sources} - PROPERTIES - COMPILE_OPTIONS "-O3" -) +foreach(_src_fn ${_sources}) + get_source_file_property(_compile_options ${_src_fn} COMPILE_OPTIONS) + set(_combined_options ${_compile_options} "-O3") + set_source_files_properties(${_src_fn} + PROPERTIES + COMPILE_OPTIONS "${_combined_options}" + ) +endforeach() target_link_options(${py_module_name} PRIVATE -fsycl-device-code-split=per_kernel) add_executable(standalone_cpp diff --git a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt index f246d29924..ec33b2e153 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt @@ -1,10 +1,11 @@ 
-cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) -project(use_queue_device LANGUAGES CXX) +project(use_queue_device VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using dpctl.program.SyclKernel <-> sycl::kernel type casting") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -19,15 +20,16 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _use_kernel) +set(_sources use_kernel/_example.cpp) pybind11_add_module(${py_module_name} MODULE - use_kernel/_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION use_kernel diff --git a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt index f7b843d7f5..827388fae1 100644 --- a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt @@ -1,10 +1,11 @@ -cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) -project(use_queue_device LANGUAGES CXX) +project(use_queue_device VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using dpctl.SyclQueue <-> sycl::queue type caster") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -13,20 +14,21 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) include(FetchContent) FetchContent_Declare( pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz - URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz + URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _use_queue_device) +set(_sources use_queue_device/_example.cpp) pybind11_add_module(${py_module_name} MODULE - use_queue_device/_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION use_queue_device diff --git a/examples/python/sycl_timer.py b/examples/python/sycl_timer.py index f4b1416784..8ae49fd60d 100644 --- a/examples/python/sycl_timer.py +++ b/examples/python/sycl_timer.py @@ -15,14 +15,27 @@ # limitations under the License. 
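The rewrite of `sycl_timer.py` that follows drops the dpnp dependency and times a pure `dpctl.tensor` workload. The core pattern it demonstrates, a profiling-enabled queue wrapped by `SyclTimer`, looks roughly like this (the workload here is illustrative):

```python
import dpctl
import dpctl.tensor as dpt
from dpctl import SyclTimer

# device timestamps are only collected on queues created
# with the "enable_profiling" property
q = dpctl.SyclQueue(property="enable_profiling")

timer = SyclTimer(time_scale=1)  # report times in seconds
with timer(q):
    x = dpt.arange(10**6, dtype=dpt.float32, sycl_queue=q)
    s = dpt.sum(x * x)

host_time, device_time = timer.dt  # wall-clock vs. device execution time
print(host_time, device_time)
```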
-import dpnp import numpy as np import dpctl import dpctl.tensor as dpt from dpctl import SyclTimer -n = 4000 + +def matmul(m1, m2): + """Naive matrix multiplication implementation""" + assert m1.ndim == 2 + assert m2.ndim == 2 + assert m1.shape[1] == m2.shape[0] + m1 = m1[:, dpt.newaxis, :] + m2 = dpt.permute_dims(m2, (1, 0))[dpt.newaxis, :, :] + # form m_prod[i, j, k] = m1[i,k] * m2[k, j] + m_prods = m1 * m2 + # sum over k + return dpt.sum(m_prods, axis=-1) + + +n = 500 try: q = dpctl.SyclQueue(property="enable_profiling") @@ -33,32 +46,36 @@ ) exit(0) -a = dpt.reshape(dpt.arange(n * n, dtype=np.float32, sycl_queue=q), (n, n)) -b = dpt.reshape( - dpt.asarray(np.random.random(n * n), dtype=np.float32, sycl_queue=q), (n, n) -) +a_flat = dpt.arange(n * n, dtype=dpt.float32, sycl_queue=q) +a = dpt.reshape(a_flat, (n, n)) -timer = SyclTimer(time_scale=1) +b_rand = np.random.random(n * n).astype(np.float32) +b_flat = dpt.asarray(b_rand, dtype=dpt.float32, sycl_queue=q) +b = dpt.reshape(b_flat, (n, n)) wall_times = [] device_times = [] + print( - f"Performing matrix multiplication of two {n} by {n} matrices " + f"Computing naive matrix multiplication of two {n} by {n} matrices " f"on {q.sycl_device.name}, repeating 5 times." ) +print() for _ in range(5): + timer = SyclTimer(time_scale=1) with timer(q): - a_matmul_b = dpnp.matmul(a, b) + a_matmul_b = matmul(a, b) host_time, device_time = timer.dt wall_times.append(host_time) device_times.append(device_time) -c = dpnp.asnumpy(a_matmul_b) -cc = np.dot(dpnp.asnumpy(a), dpnp.asnumpy(b)) +c = dpt.asnumpy(a_matmul_b) +cc = np.dot(dpt.asnumpy(a), dpt.asnumpy(b)) print("Wall time: ", wall_times, "\nDevice time: ", device_times) +print() print( "Accuracy test: passed." if np.allclose(c, cc) - else (f"Accuracy test: failed. Discrepancy {np.max(np.abs(c-cc))}") + else (f"Accuracy test: FAILED. \n Discrepancy = {np.max(np.abs(c-cc))}") ) diff --git a/libsyclinterface/CMakeLists.txt b/libsyclinterface/CMakeLists.txt index 01b0321064..64ec3271b1 100644 --- a/libsyclinterface/CMakeLists.txt +++ b/libsyclinterface/CMakeLists.txt @@ -11,8 +11,8 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/mo find_package(Git REQUIRED) -if(NOT DEFINED IntelDPCPP_FOUND OR NOT IntelDPCPP_FOUND) - find_package(IntelDPCPP REQUIRED) +if(NOT DEFINED IntelSYCL_FOUND OR NOT IntelSYCL_FOUND) + find_package(IntelSYCL REQUIRED) endif() # Option to turn on support for creating Level Zero interoperability programs @@ -43,11 +43,10 @@ option(DPCTL_ENABLE_GLOG ) # Minimum version requirement only when oneAPI dpcpp is used. 
-find_package(IntelDPCPP REQUIRED) if(DPCTL_DPCPP_FROM_ONEAPI) - find_package(IntelSycl 2021.3.0 REQUIRED) + find_package(IntelSyclCompiler 2021.3.0 REQUIRED) else() - find_package(IntelSycl REQUIRED) + find_package(IntelSyclCompiler REQUIRED) endif() if(DPCTL_ENABLE_L0_PROGRAM_CREATION) @@ -57,7 +56,7 @@ if(DPCTL_ENABLE_L0_PROGRAM_CREATION) if (UNIX) find_library(PI_LEVEL_ZERO_LIB NAMES pi_level_zero - HINTS ${IntelSycl_LIBRARY_DIR} + HINTS ${IntelSyclCompiler_LIBRARY_DIR} ) find_program(READELF_PROG readelf) find_program(GREP_PROG grep) @@ -77,7 +76,7 @@ endif() if (UNIX) find_library(PI_OPENCL_LIB NAMES pi_opencl - HINTS ${IntelSycl_LIBRARY_DIR} + HINTS ${IntelSyclCompiler_LIBRARY_DIR} ) find_program(READELF_PROG readelf) find_program(GREP_PROG grep) @@ -157,7 +156,6 @@ elseif(UNIX) string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}" - "-fsycl " ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXFLAGS}") @@ -206,6 +204,7 @@ add_library(DPCTLSyclInterface ${sources} ${helper_sources} ) +add_sycl_to_target(TARGET DPCTLSyclInterface SOURCES ${sources} ${helper_sources}) if(DPCTL_GENERATE_COVERAGE) target_link_options(DPCTLSyclInterface @@ -222,9 +221,6 @@ target_include_directories(DPCTLSyclInterface ${CMAKE_CURRENT_SOURCE_DIR}/helper/include/ ${SYCL_INCLUDE_DIR} ) -target_link_libraries(DPCTLSyclInterface - PRIVATE ${IntelSycl_SYCL_LIBRARY} -) if(DPCTL_ENABLE_GLOG) find_package(glog REQUIRED) diff --git a/libsyclinterface/cmake/modules/FindIntelSycl.cmake b/libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake similarity index 64% rename from libsyclinterface/cmake/modules/FindIntelSycl.cmake rename to libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake index 84e8946fea..45bb4f583f 100644 --- a/libsyclinterface/cmake/modules/FindIntelSycl.cmake +++ b/libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake @@ -19,21 +19,23 @@ # # Example usage: # -# find_package(IntelSycl) +# find_package(IntelSyclCompiler) # # If successful, the following variables will be defined: -# IntelSycl_FOUND -# IntelSycl_VERSION -# IntelSycl_INCLUDE_DIR -# IntelSycl_C_COMPILER -# IntelSycl_CXX_COMPILER -# IntelSycl_SYCL_INCLUDE_DIR -# IntelSycl_LIBRARY_DIR -# IntelSycl_SYCL_LIBRARY -# IntelSycl_OPENCL_LIBRARY +# IntelSyclCompiler_FOUND +# IntelSyclCompiler_VERSION +# IntelSyclCompiler_INCLUDE_DIR +# IntelSyclCompiler_C_COMPILER +# IntelSyclCompiler_CXX_COMPILER +# IntelSyclCompiler_SYCL_INCLUDE_DIR +# IntelSyclCompiler_LIBRARY_DIR +# IntelSyclCompiler_SYCL_LIBRARY +# IntelSyclCompiler_OPENCL_LIBRARY include(FindPackageHandleStandardArgs) -find_package(IntelDPCPP REQUIRED) +if(NOT DEFINED IntelSYCL_FOUND OR NOT IntelSYCL_FOUND) + find_package(IntelSYCL REQUIRED) +endif() # We will extract the version information from the compiler set(clangxx_cmd "${CMAKE_CXX_COMPILER}") @@ -91,78 +93,78 @@ execute_process( # If dpcpp is found then set the package variables if(${clangxx_result} MATCHES "0") - string(REPLACE "\n" ";" IntelSycl_VERSION_LIST "${clangxx_ver}") + string(REPLACE "\n" ";" IntelSyclCompiler_VERSION_LIST "${clangxx_ver}") set(IDX 0) - foreach(X ${IntelSycl_VERSION_LIST}) + foreach(X ${IntelSyclCompiler_VERSION_LIST}) message(STATUS "dpcpp ver[${IDX}]: ${X}") MATH(EXPR IDX "${IDX}+1") endforeach() - list(GET IntelSycl_VERSION_LIST 0 VERSION_STRING) + list(GET IntelSyclCompiler_VERSION_LIST 0 VERSION_STRING) # Get the dpcpp version string(REGEX MATCH "[0-9]+\.[0-9]+\.[0-9]+" - IntelSycl_VERSION + IntelSyclCompiler_VERSION ${VERSION_STRING} ) # 
Split out the version into major, minor and patch
-    string(REPLACE "." ";" IntelSycl_VERSION_LIST1 "${IntelSycl_VERSION}")
-    list(GET IntelSycl_VERSION_LIST1 0 IntelSycl_VERSION_MAJOR)
-    list(GET IntelSycl_VERSION_LIST1 1 IntelSycl_VERSION_MINOR)
-    list(GET IntelSycl_VERSION_LIST1 2 IntelSycl_VERSION_PATCH)
-    set(IntelSycl_INCLUDE_DIR ${SYCL_INCLUDE_DIR})
-    set(IntelSycl_SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR}/sycl)
-    set(IntelSycl_LIBRARY_DIR ${SYCL_LIBRARY_DIR})
+    string(REPLACE "." ";" IntelSyclCompiler_VERSION_LIST1 "${IntelSyclCompiler_VERSION}")
+    list(GET IntelSyclCompiler_VERSION_LIST1 0 IntelSyclCompiler_VERSION_MAJOR)
+    list(GET IntelSyclCompiler_VERSION_LIST1 1 IntelSyclCompiler_VERSION_MINOR)
+    list(GET IntelSyclCompiler_VERSION_LIST1 2 IntelSyclCompiler_VERSION_PATCH)
+    set(IntelSyclCompiler_INCLUDE_DIR ${SYCL_INCLUDE_DIR})
+    set(IntelSyclCompiler_SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR}/sycl)
+    set(IntelSyclCompiler_LIBRARY_DIR ${SYCL_LIBRARY_DIR})
     if("x${CMAKE_SYSTEM_NAME}" STREQUAL "xWindows")
         find_file(
-            IntelSycl_SYCL_LIBRARY
+            IntelSyclCompiler_SYCL_LIBRARY
             NAMES "sycl.lib" "sycl6.lib" "sycl7.lib"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
         find_file(
-            IntelSycl_OPENCL_LIBRARY
+            IntelSyclCompiler_OPENCL_LIBRARY
             NAMES "OpenCL.lib"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
     elseif("x${CMAKE_SYSTEM_NAME}" STREQUAL "xLinux")
         find_file(
-            IntelSycl_SYCL_LIBRARY
+            IntelSyclCompiler_SYCL_LIBRARY
             NAMES "libsycl.so"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
         find_file(
-            IntelSycl_OPENCL_LIBRARY
+            IntelSyclCompiler_OPENCL_LIBRARY
             NAMES "libOpenCL.so"
-            PATHS ${IntelSycl_LIBRARY_DIR}
+            PATHS ${IntelSyclCompiler_LIBRARY_DIR}
         )
     endif()
 endif()
 
 # Check if a specific version of DPCPP is requested.
-if(IntelSycl_FIND_VERSION AND (DEFINED IntelSycl_VERSION)) +if(IntelSyclCompiler_FIND_VERSION AND (DEFINED IntelSyclCompiler_VERSION)) set(VERSION_GT_FIND_VERSION FALSE) versions_greater_equal( - ${IntelSycl_VERSION} - ${IntelSycl_FIND_VERSION} + ${IntelSyclCompiler_VERSION} + ${IntelSyclCompiler_FIND_VERSION} VERSION_GT_FIND_VERSION ) if(VERSION_GT_FIND_VERSION) - set(IntelSycl_FOUND TRUE) + set(IntelSyclCompiler_FOUND TRUE) else() - set(IntelSycl_FOUND FALSE) + set(IntelSyclCompiler_FOUND FALSE) endif() else() - set(IntelSycl_FOUND TRUE) + set(IntelSyclCompiler_FOUND TRUE) endif() -find_package_handle_standard_args(IntelSycl DEFAULT_MSG - IntelSycl_FOUND - IntelSycl_VERSION - IntelSycl_INCLUDE_DIR - IntelSycl_SYCL_INCLUDE_DIR - IntelSycl_LIBRARY_DIR - IntelSycl_SYCL_LIBRARY - IntelSycl_OPENCL_LIBRARY +find_package_handle_standard_args(IntelSyclCompiler DEFAULT_MSG + IntelSyclCompiler_FOUND + IntelSyclCompiler_VERSION + IntelSyclCompiler_INCLUDE_DIR + IntelSyclCompiler_SYCL_INCLUDE_DIR + IntelSyclCompiler_LIBRARY_DIR + IntelSyclCompiler_SYCL_LIBRARY + IntelSyclCompiler_OPENCL_LIBRARY ) diff --git a/libsyclinterface/cmake/modules/GetProjectVersion.cmake b/libsyclinterface/cmake/modules/GetProjectVersion.cmake index c0f4ec4a6f..a863a4ee17 100644 --- a/libsyclinterface/cmake/modules/GetProjectVersion.cmake +++ b/libsyclinterface/cmake/modules/GetProjectVersion.cmake @@ -29,7 +29,7 @@ # VERSION_MINOR # VERSION # SEMVER -cmake_minimum_required( VERSION 3.14.0 ) +cmake_minimum_required(VERSION 3.14...3.27 FATAL_ERROR ) function(get_version) # Use git describe to get latest tag name diff --git a/libsyclinterface/include/Config/dpctl_config.h.in b/libsyclinterface/include/Config/dpctl_config.h.in index f26fc5591b..6e3daffbed 100644 --- a/libsyclinterface/include/Config/dpctl_config.h.in +++ b/libsyclinterface/include/Config/dpctl_config.h.in @@ -31,7 +31,7 @@ #define __SYCL_COMPILER_VERSION_REQUIRED 20221201L /* The DPCPP version used to build dpctl */ -#define DPCTL_DPCPP_VERSION "@IntelSycl_VERSION@" +#define DPCTL_DPCPP_VERSION "@IntelSyclCompiler_VERSION@" #define DPCTL_LIBZE_LOADER_FILENAME "@LIBZE_LOADER_FILENAME@" #define DPCTL_LIBCL_LOADER_FILENAME "@LIBCL_LOADER_FILENAME@" diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h index 1c5e53a395..cc466fce17 100644 --- a/libsyclinterface/include/dpctl_sycl_queue_interface.h +++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h @@ -294,6 +294,29 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, const void *Src, size_t Count); +/*! + * @brief C-API wrapper for ``sycl::queue::memcpy``. + * + * @param QRef An opaque pointer to the ``sycl::queue``. + * @param Dest An USM pointer to the destination memory. + * @param Src An USM pointer to the source memory. + * @param Count A number of bytes to copy. + * @param DepEvents A pointer to array of DPCTLSyclEventRef opaque + * pointers to dependent events. + * @param DepEventsCount A number of dependent events. + * @return An opaque pointer to the ``sycl::event`` returned by the + * ``sycl::queue::memcpy`` function. + * @ingroup QueueInterface + */ +DPCTL_API +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + __dpctl_keep const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount); + /*! * @brief C-API wrapper for ``sycl::queue::prefetch``. 
 *
diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
index b5a97013c2..7a159a331c 100644
--- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
@@ -543,6 +543,18 @@ DPCTLDevice_GetParentDevice(__dpctl_keep const DPCTLSyclDeviceRef DRef)
 {
     auto D = unwrap(DRef);
     if (D) {
+        bool is_unpartitioned = false;
+        try {
+            auto pp =
+                D->get_info<sycl::info::device::partition_type_property>();
+            is_unpartitioned =
+                (pp == sycl::info::partition_property::no_partition);
+        } catch (std::exception const &e) {
+            error_handler(e, __FILE__, __func__, __LINE__);
+            return nullptr;
+        }
+        if (is_unpartitioned)
+            return nullptr;
         try {
             const auto &parent_D =
                 D->get_info<sycl::info::device::parent_device>();
             return wrap(new device(parent_D));
diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
index 4903b888ff..60098ae933 100644
--- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
@@ -410,9 +410,12 @@ DPCTLQueue_SubmitNDRange(__dpctl_keep const DPCTLSyclKernelRef KRef,
     try {
         e = Queue->submit([&](handler &cgh) {
             // Depend on any event that was specified by the caller.
-            if (NDepEvents)
-                for (auto i = 0ul; i < NDepEvents; ++i)
-                    cgh.depends_on(*unwrap(DepEvents[i]));
+            if (DepEvents)
+                for (auto i = 0ul; i < NDepEvents; ++i) {
+                    auto ei = unwrap(DepEvents[i]);
+                    if (ei)
+                        cgh.depends_on(*ei);
+                }
 
             for (auto i = 0ul; i < NArgs; ++i) {
                 // \todo add support for Sycl buffers
@@ -485,6 +488,42 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef,
     }
 }
 
+__dpctl_give DPCTLSyclEventRef
+DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef,
+                            void *Dest,
+                            const void *Src,
+                            size_t Count,
+                            const DPCTLSyclEventRef *DepEvents,
+                            size_t DepEventsCount)
+{
+    event ev;
+    auto Q = unwrap(QRef);
+    if (Q) {
+        try {
+            ev = Q->submit([&](handler &cgh) {
+                if (DepEvents)
+                    for (size_t i = 0; i < DepEventsCount; ++i) {
+                        event *ei = unwrap(DepEvents[i]);
+                        if (ei)
+                            cgh.depends_on(*ei);
+                    }
+
+                cgh.memcpy(Dest, Src, Count);
+            });
+        } catch (const std::exception &ex) {
+            error_handler(ex, __FILE__, __func__, __LINE__);
+            return nullptr;
+        }
+    }
+    else {
+        error_handler("QRef passed to memcpy was NULL.", __FILE__, __func__,
+                      __LINE__);
+        return nullptr;
+    }
+
+    return wrap(new event(ev));
+}
+
 __dpctl_give DPCTLSyclEventRef
 DPCTLQueue_Prefetch(__dpctl_keep DPCTLSyclQueueRef QRef,
                     const void *Ptr,
diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt
index 4cfd30338d..472e1787fa 100644
--- a/libsyclinterface/tests/CMakeLists.txt
+++ b/libsyclinterface/tests/CMakeLists.txt
@@ -21,17 +21,39 @@ foreach(tf ${spirv-test-files})
     file(COPY ${tf} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 endforeach()
 
-if(DPCTL_GENERATE_COVERAGE)
-    file(GLOB_RECURSE
-        sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
-    )
+file(GLOB_RECURSE
+    sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+)
 
-    # Add all dpctl sources into a single executable so that we can run coverage
-    # analysis and generate a report.
-    add_executable(dpctl_c_api_tests
-        EXCLUDE_FROM_ALL
-        ${sources}
-    )
+# Add all dpctl sources into a single executable so that we can run coverage
+# analysis and generate a report.
+add_executable(dpctl_c_api_tests + EXCLUDE_FROM_ALL + ${sources} +) +add_sycl_to_target( + TARGET dpctl_c_api_tests + SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/test_helper.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_context_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_invalid_filters.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_subdevices.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_manager.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_selector_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_aspects.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_event_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_bundle_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_invalid_filters.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_manager.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp +) + +if(DPCTL_GENERATE_COVERAGE) target_include_directories(dpctl_c_api_tests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../helper/include" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include" @@ -40,7 +62,7 @@ if(DPCTL_GENERATE_COVERAGE) ${CMAKE_THREAD_LIBS_INIT} GTest::GTest DPCTLSyclInterface - ${IntelSycl_OPENCL_LIBRARY} + ${IntelSyclCompiler_OPENCL_LIBRARY} ${CMAKE_DL_LIBS} ) set(object_arg "-object;") @@ -90,13 +112,11 @@ if(DPCTL_GENERATE_COVERAGE) DEPENDS dpctl_c_api_tests ) else() - file(GLOB_RECURSE sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) - add_executable(dpctl_c_api_tests EXCLUDE_FROM_ALL ${sources}) target_link_libraries(dpctl_c_api_tests ${CMAKE_THREAD_LIBS_INIT} GTest::GTest DPCTLSyclInterface - ${IntelSycl_OPENCL_LIBRARY} + ${IntelSyclCompiler_OPENCL_LIBRARY} ) endif() diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp index 8d23929d39..836a87379b 100644 --- a/libsyclinterface/tests/test_sycl_queue_interface.cpp +++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp @@ -340,6 +340,10 @@ TEST(TestDPCTLSyclQueueInterface, CheckMemOpsZeroQRef) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); ASSERT_FALSE(bool(ERef)); @@ -391,6 +395,10 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemOpsNullPtr) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); if (ERef) { ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); @@ -450,6 +458,38 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset) delete[] host_arr; } +TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset2) +{ + DPCTLSyclUSMRef p = nullptr; + DPCTLSyclEventRef Memset_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; + uint8_t val = 42; + size_t nbytes = 256; + uint8_t *host_arr = new uint8_t[nbytes]; + + ASSERT_FALSE(host_arr == nullptr); + + ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); + ASSERT_FALSE(p == 
diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp
index 8d23929d39..836a87379b 100644
--- a/libsyclinterface/tests/test_sycl_queue_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp
@@ -340,6 +340,10 @@ TEST(TestDPCTLSyclQueueInterface, CheckMemOpsZeroQRef)
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes));
     ASSERT_FALSE(bool(ERef));
 
+    ASSERT_NO_FATAL_FAILURE(
+        ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0));
+    ASSERT_FALSE(bool(ERef));
+
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes));
     ASSERT_FALSE(bool(ERef));
 
@@ -391,6 +395,10 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemOpsNullPtr)
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes));
     ASSERT_FALSE(bool(ERef));
 
+    ASSERT_NO_FATAL_FAILURE(
+        ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0));
+    ASSERT_FALSE(bool(ERef));
+
     ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes));
     if (ERef) {
         ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
@@ -450,6 +458,38 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset)
     delete[] host_arr;
 }
 
+TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset2)
+{
+    DPCTLSyclUSMRef p = nullptr;
+    DPCTLSyclEventRef Memset_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
+    uint8_t val = 42;
+    size_t nbytes = 256;
+    uint8_t *host_arr = new uint8_t[nbytes];
+
+    ASSERT_FALSE(host_arr == nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef));
+    ASSERT_FALSE(p == nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(
+        Memset_ERef = DPCTLQueue_Memset(QRef, (void *)p, val, nbytes));
+
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Memset_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
+
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memset_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
+
+    ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
+
+    for (size_t i = 0; i < nbytes; ++i) {
+        ASSERT_TRUE(host_arr[i] == val);
+    }
+    delete[] host_arr;
+}
+
 TEST(TestDPCTLSyclQueueInterface, CheckFillNullQRef)
 {
     DPCTLSyclQueueRef QRef = nullptr;
@@ -481,7 +521,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8)
 {
     using T = uint8_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill8_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xB);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -492,17 +533,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8)
     ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef));
     ASSERT_FALSE(p == nullptr);
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
+    ASSERT_NO_FATAL_FAILURE(Fill8_ERef =
                                 DPCTLQueue_Fill8(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
-    ERef = nullptr;
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill8_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
+
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill8_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -517,7 +556,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16)
 {
     using T = uint16_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill16_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xAB);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -529,16 +569,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill16_ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill16_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill16_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -553,7 +591,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32)
 {
     using T = uint32_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill32_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xABCD);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -565,16 +604,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill32_ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill32_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill32_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -589,7 +626,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64)
 {
     using T = uint64_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill64_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xABCDEF73);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -601,16 +639,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill64_ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill64_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill64_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -639,7 +675,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128)
 {
     using T = value128_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill128_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val{static_cast<uint64_t>(0xABCDEF73), static_cast<uint64_t>(0x3746AF05)};
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -651,17 +688,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill128(QRef, (void *)p,
-                                  reinterpret_cast<uint64_t *>(&val), nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill128_ERef = DPCTLQueue_Fill128(
+            QRef, (void *)p, reinterpret_cast<uint64_t *>(&val), nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill128_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill128_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
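One consequence of the dpctl_sycl_device_interface.cpp change at the top of this section: DPCTLDevice_GetParentDevice now returns NULL for an unpartitioned root device instead of surfacing the get_info exception, which makes it usable as a cheap sub-device probe. A hedged sketch, not code from the patch; DPCTLDevice_Delete is assumed to exist as the device analogue of the DPCTLEvent_Delete used throughout this patch.

    // Sketch: NULL now cleanly means "no parent", i.e. a root device.
    bool is_sub_device(DPCTLSyclDeviceRef DRef)
    {
        DPCTLSyclDeviceRef parent = DPCTLDevice_GetParentDevice(DRef);
        if (parent) {
            DPCTLDevice_Delete(parent); // assumed deleter for device refs
            return true;
        }
        return false;
    }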
diff --git a/setup.py b/setup.py
index f6780633cc..eb942a71b4 100644
--- a/setup.py
+++ b/setup.py
@@ -176,9 +176,31 @@ def _get_cmdclass():
         "dpctl.utils",
     ],
     package_data={
-        "dpctl": ["tests/*.*", "tests/helper/*.py", "tests/elementwise/*.py"]
+        "dpctl": [
+            "tests/*.*",
+            "tests/helper/*.py",
+            "tests/elementwise/*.py",
+            "tests/*.pyx",
+            "tests/input_files/*",
+            "resources/cmake/*.cmake",
+            "include/*.h*",
+            "include/syclinterface/*.h*",
+            "include/syclinterface/Config/*.h",
+            "include/syclinterface/Support/*.h",
+            "tensor/libtensor/include/kernels/*.h*",
+            "tensor/libtensor/include/utils/*.h*",
+            "tensor/include/dlpack/*.*",
+            "_sycl*.h",
+            "memory/_memory*.h",
+            "program/_program*.h",
+            "tensor/_usmarray*.h",
+            "*.pxd",
+            "memory/*.pxd",
+            "tensor/*.pxd",
+            "program/*.pxd",
+        ]
     },
-    include_package_data=True,
+    include_package_data=False,
     zip_safe=False,
     setup_requires=["Cython"],
     install_requires=[